同花顺 formula collection verification¶

In [ ]:
%pip install Appium-Python-Client
In [ ]:
from appium import webdriver

desired_caps = {
    "app": r'E:\同花顺\hexin.exe',
    # "appTopLevelWindow": 0x361506,
    "platformName": "Windows",
    "deviceName": "WindowsPC"
}

driver = webdriver.Remote(
    command_executor='http://127.0.0.1:4723/wd/hub',
    desired_capabilities=desired_caps
)

current_window = driver.current_window_handle
all_windows = driver.window_handles
print(all_windows)
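If the 同花顺 client is already running, the session can attach to the existing main window instead of launching hexin.exe (the commented appTopLevelWindow capability above). A minimal sketch, assuming pywin32 is installed; the window title passed to FindWindow is a placeholder and must match the real title exactly:
In [ ]:
import win32gui  # pywin32, assumed to be installed

# Handle of the running client's top-level window; the title below is a placeholder.
hwnd = win32gui.FindWindow(None, "同花顺")
assert hwnd, "window not found - check the title"

attach_caps = {
    "platformName": "Windows",
    "deviceName": "WindowsPC",
    # WinAppDriver expects the native window handle as a hex string.
    "appTopLevelWindow": hex(hwnd),
}
driver = webdriver.Remote(
    command_executor='http://127.0.0.1:4723/wd/hub',
    desired_capabilities=attach_caps
)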
In [ ]:
# Close the new-user promotion pop-up window
driver.find_element_by_accessibility_id('1000').click()
In [ ]:
from appium.webdriver.common.touch_action import TouchAction
In [ ]:
driver.tap([(14,323)])
In [ ]:
win1 = driver.find_element_by_accessibility_id('59648')
In [ ]:
# TODO: click the K-line (candlestick) chart
action = TouchAction(driver)
action.tap(element=win1, x=-200, y=-200).perform()
In [ ]:
指标标签 = driver.find_element_by_accessibility_id('20556')
In [ ]:
指标标签.text
In [ ]:
主要曲线 = driver.find_element_by_name("主要曲线")
In [ ]:
主要曲线.click()
主要曲线.click()
主要曲线.click()
In [ ]:
action = TouchAction(driver)
action.tap(主要曲线).wait(10).press(主要曲线).perform()
In [ ]:
driver.find_element_by_accessibility_id("293654952").text
In [ ]:
with open("page_source.xml", 'w', encoding='utf-8') as fpw:
    fpw.write(driver.page_source)
In [ ]:
# Perform any other operations here

driver.quit()
In [ ]:
driver.find_element_by_name('文件(F)').click()
from time import sleep
sleep(2)
driver.find_element_by_name('保存(S)	Ctrl+S').click()
# driver.find_element_by_name('退出(X)').click()
sleep(1)
import pyautogui
pyautogui.PAUSE = 0.5
pyautogui.typewrite(r'D:\hello.txt')
pyautogui.press('enter')

1_Crawl the list¶

In [ ]:
import requests
from datetime import datetime, timedelta
import logging
import pandas as pd
import time

from lds_util import dump, dumps, mkpdir, get_dict

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)
excel_file = r"D:\Downloads\沪深A股_sample.xlsx"
data_frame = pd.read_excel(excel_file)
excel_data = data_frame.values.tolist()
In [ ]:
excel_data
In [ ]:
import redis 

pool = redis.ConnectionPool(host='127.0.0.1', port=6379, decode_responses=True)
redis_cli = redis.Redis(host='127.0.0.1', port=6379, password='lds', decode_responses=True, db=0)

lbty_cookie_key = "lds:lbty:cookie"
g_cookies = eval(redis_cli.get(lbty_cookie_key))

g_cookies
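eval() on the Redis value executes whatever text is stored there. A safer round-trip is to store the cookie dict as JSON; this is only a sketch and assumes the writer side is changed accordingly (same lds:lbty:cookie key):
In [ ]:
import json

# Writer side (wherever the cookie is refreshed): store JSON instead of a Python repr.
# redis_cli.set(lbty_cookie_key, json.dumps(g_cookies))

# Reader side: json.loads parses the data without executing it.
raw = redis_cli.get(lbty_cookie_key)
g_cookies = json.loads(raw) if raw else {}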
In [ ]:
class CookieError(RuntimeError):
    def __init__(self, *args: object) -> None:
        super().__init__(*args)


import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-cn',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Origin': 'https://robo.datayes.com',
    'Connection': 'keep-alive',
    'Referer': 'https://robo.datayes.com/',
    # 'Cookie': 'gr_user_id=26250982-1a9b-4973-970a-fe103ede4ed6; cloud-anonymous-token=f07471e9ffa64e23a91b5245a6642387; cloud-sso-token=E5B1067A4772527AFAE513AAEA4684BF; rrp-wechat-login-token=520c813c-81ea-4e7b-aeed-7c4ae9ece81d; _ga=GA1.2.1213974505.1692239696; _gid=GA1.2.382730469.1692239696; _ga_YBHR4XSTK7=GS1.2.1692239696.1.1.1692239872.60.0.0; ba895d61f7404b76_gr_last_sent_cs1=10618609@wmcloud.com; ba895d61f7404b76_gr_session_id=36a6d9aa-b878-4b0b-a756-7aad7dab326a; ba895d61f7404b76_gr_last_sent_sid_with_cs1=36a6d9aa-b878-4b0b-a756-7aad7dab326a; ba895d61f7404b76_gr_cs1=10618609@wmcloud.com; ba895d61f7404b76_gr_session_id_sent_vst=36a6d9aa-b878-4b0b-a756-7aad7dab326a',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}


MAX_PAGE = 8
OUTFILE_BASEDIR = "e:/萝卜投研"
OUTFILE_FORMAT = OUTFILE_BASEDIR + "/{}/{}_{}.json"
mkpdir(OUTFILE_BASEDIR + "/das")

PARAMS = {
    'pageNow': '1',
    'pageSize': '30',
    'sortField': 'effectiveTime',
    'sortOrder': 'desc',
    'isDidMount': 'true',
    'ticker': '002249',
    'sentiment': '1',
}

g_session = requests.session()

update_cookie = time.time()

def one_ticker(ticker_item: list[str]):
    global g_cookies
    ticker = ticker_item[0].split(".")[0]
    logger.info("ticker: %s", ticker_item)
    info_file = f"{OUTFILE_BASEDIR}/{ticker}/info.json"
    mkpdir(info_file)
    with open(info_file, 'w', encoding='utf-8') as fpw:
        dump(ticker_item, fpw)
        
    PARAMS["ticker"] = ticker
    for sentiment in ["1", "-1"]:
        PARAMS["sentiment"] = sentiment
        for page in range(1, MAX_PAGE + 1):
            def crawl():
                global g_cookies, PARAMS, g_session, OUTFILE_FORMAT
                outfile = OUTFILE_FORMAT.format(ticker, sentiment, page)
                if os.path.exists(outfile):
                    # logger.info("已经采集:%s", outfile)
                    return None
                    
                PARAMS["pageNow"] = page
                logger.info("%s_%s", ticker, page)
                response = g_session.get(
                    'https://gw.datayes.com/rrp_adventure/publicOpinion/getSentimentNews',
                    params=PARAMS,
                    cookies=g_cookies,
                    headers=headers,
                )
                resp_json = response.json()
                code = resp_json["code"]
                if code == 1:
                    # total_page = resp_json["data"]["total"]
                    # assert total_page is not None
                    outfile = OUTFILE_FORMAT.format(ticker, sentiment, page)
                    mkpdir(outfile)
                    logger.info("outfile: %s", outfile)
                    with open(outfile, 'w', encoding='utf-8') as fpw:
                        fpw.write(response.text)
                    time.sleep(1)
                elif code == -403 or resp_json.get("message") == "Need login":
                    logger.error("Please refresh the cookie; seconds since last refresh: %s", time.time() - update_cookie)
                    time.sleep(120)
                    g_cookies = eval(redis_cli.get(lbty_cookie_key))
                    raise CookieError()
            try:
                crawl()
            except CookieError:
                crawl()
                
In [ ]:
for item in excel_data:
    one_ticker(item)

2_Parse the data¶

In [ ]:
import json
from urllib.parse import urlparse

def get_domain(url: str):
    return urlparse(url).netloc
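A quick check of the helper, using the same URL that appears later in the md5_hash cell; it keeps only the host part, which is what the per-domain handlers are keyed on:
In [ ]:
get_domain("https://caifuhao.eastmoney.com/news/20230704112547970423200")
# -> 'caifuhao.eastmoney.com'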
In [ ]:
g_domain = set()
g_siteName = set()

class Model(object):
    def __init__(self, *args, **kwargs):
        self.stockName = None
        self.stockCode = None
        self.bullish = None
        self.title = None
        self.content = None
        self.publishTime = None
        self.author = None
        
        self.industry = None
        self.url = None
        self.source = None
        self.siteName = None


@get_dict      
def parse(item, g_stock_name, g_stock_code, g_sentiment):
    '''
    Parse one news item into a Model.
    '''
    model = Model()
    model.stockName = g_stock_name
    model.stockCode = g_stock_code
    if g_sentiment == '1':
        model.bullish = 1
    elif g_sentiment == '-1':
        model.bullish = 0
    else:
        raise ValueError(g_sentiment)
        
    model.title = item["title"]
    model.content = None
    date = datetime.fromtimestamp(item["publishTimeStm"] / 1000)
    model.publishTime = date.strftime("%Y-%m-%d %H:%M:%S")
    model.author = item.get("author")
    
    model.source = item.get("sourceName")
    model.industry = item["industry"]
    model.siteName = item["siteName"]
    model.url = item["url"]
    g_domain.add(get_domain(model.url))
    g_siteName.add(model.siteName)
    logger.info("dict: %s, domain: %s",model.__dict__, get_domain(model.url))
    return model


res = {}
# for dirpath, dirnames, filenames in os.walk(OUTFILE_BASEDIR):
#     if dirpath == 'e:/萝卜投研' or "html" in dirpath:
#         continue
#     logger.info(f"dirpath: {dirpath}")
#     g_stock_name = None
#     stock = os.path.basename(dirpath)
#     g_sentiment= None
#     with open(os.path.join(dirpath, "info.json"), 'r', encoding='utf-8') as fpr:
#         info_json = json.load(fpr)
#         g_stock_code, g_stock_name, _, _ = info_json
#     logger.info(f"g_stock_code: {g_stock_code},  g_stock_name: {g_stock_name}")
#     key1 = f'{stock}_1'
#     key2 = f'{stock}_-1'
#     res[key1] = res.get(key1, [])
#     res[key2] = res.get(key2, [])
#     for fname in filenames:
#         if 'info' in fname:
#             continue
#         else:
#             assert len(fname.split("_")) == 2
#             g_sentiment = fname.split("_")[0]
#             with open(os.path.join(dirpath, fname), 'r', encoding='utf-8') as fpr:
#                 res[f'{stock}_{g_sentiment}'].extend(map(lambda it: parse(it, g_stock_name, g_stock_code, g_sentiment), 
#                                json.load(fpr)["data"]["hits"]))
        
In [ ]:
for k, v in res.items():
    v.sort(key=lambda it: it["publishTime"], reverse=True)
with open(f"{OUTFILE_BASEDIR}/data.json", 'w', encoding='utf-8') as fpw:
    dump(res, fpw)
In [ ]:
 
In [ ]:
len(g_domain), len(g_siteName)
In [ ]:
for k, models in res.items():
    temp = []
    date_set = set()
    for model in models:
        if model["publishTime"][:10] not in date_set:
            temp.append(model)
            date_set.add(model["publishTime"][:10])
            if len(temp) == 20:
                res[k] = temp
                break
    else:
        if len(date_set) < 10:
            logger.error("数据不够啊" + k + " date_set " + str(date_set))
with open(f"{OUTFILE_BASEDIR}/data_recent10.json", 'w', encoding='utf-8') as fpw:
    dump(res, fpw)
In [ ]:
g_domain = set()
g_siteName = set()
for k, models in res.items():
    for model in models:
        g_domain.add(get_domain(model["url"]))
        g_siteName.add(model["siteName"])
In [ ]:
g_domain
In [ ]:
import traceback
from selenium.webdriver.common.by import By
import requests
from datetime import datetime, timedelta
from selenium.webdriver.chromium.webdriver import ChromiumDriver
from selenium import webdriver

from selenium.webdriver.chromium import service



def get_driver():
    for _ in range(3):
        try:
            option = webdriver.ChromeOptions()
            option.add_experimental_option("excludeSwitches", ['enable-automation'])
            # Launch argument: automatically open DevTools for each tab
            # option.add_argument("--auto-open-devtools-for-tabs")
            option.add_argument("--disable-cache")
            option.add_argument('--ignore-certificate-errors')
            # option.add_argument(f"--user-data-dir=d:/luobatouyan")
            option.add_experimental_option(
                'prefs',
                {
                    "download.prompt_for_download": False,
                    "download.directory_upgrade": True,
                    "plugins.always_open_pdf_externally": True,  # 这句配置很重要
                    "profile.managed_default_content_settings.images": 2,
                    "profile.managed_default_content_settings.media_stream": 2
                }
            )
            # chrome_service = service.ChromiumService(executable_path="d:/chromedriver")
            driver = webdriver.Chrome(
                options=option,
                # service=chrome_service
            )
            driver.implicitly_wait(5)
            return driver
        except:
            logger.error(traceback.format_exc())
            time.sleep(3)
    raise ValueError("创建 webdriver 失败, 请人工检查")
# g_driver = get_driver()
In [ ]:
model
In [ ]:
import hashlib
def md5_hash(text):
    # Create the MD5 hash object
    md5 = hashlib.md5()

    # Feed in the UTF-8 encoded text
    md5.update(text.encode('utf-8'))

    # Get the hex digest
    result = md5.hexdigest()
    
    return result
In [ ]:
md5_hash("https://caifuhao.eastmoney.com/news/20230704112547970423200")
In [ ]:
from concurrent.futures import ThreadPoolExecutor


def crawl_html(models):
    driver: ChromiumDriver = None
    try:
        count = 0
        for model in models:
            html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
            if os.path.exists(html_file):
                print("已经采集 ", html_file, end="\r")
                continue
            if driver is None:
                driver = get_driver()
            logger.warning("get %s", model["url"])
            driver.get(model["url"])
            page_source = driver.page_source
            with open(mkpdir(html_file), 'w', encoding='utf-8') as fpw:
                fpw.write(page_source)
            model["html"] = driver.page_source
            count += 1
    finally:
        if driver is not None:
            driver.quit()

with ThreadPoolExecutor(max_workers=5, thread_name_prefix="chrome_crawl") as executor:
    for k, models in res.items():
        executor.submit(crawl_html, models)
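executor.submit on its own discards any exception raised inside crawl_html. A variant of the same loop that keeps the futures and logs failures once the pool drains (a sketch, using the same crawl_html as above):
In [ ]:
from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(max_workers=5, thread_name_prefix="chrome_crawl") as executor:
    futures = {executor.submit(crawl_html, models): k for k, models in res.items()}
    for future in as_completed(futures):
        try:
            future.result()  # re-raises anything crawl_html raised in the worker
        except Exception:
            logger.error("crawl failed for %s\n%s", futures[future], traceback.format_exc())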
        
In [ ]:
import json
import bs4


OUTFILE_BASEDIR = "e:/萝卜投研"

with open(f"{OUTFILE_BASEDIR}/data_recent10.json", 'r', encoding='utf-8') as fpr:
    res = json.load(fpr)
In [ ]:
g_domain = set()
g_siteName = set()
for k, models in res.items():
    for model in models:
        g_domain.add(get_domain(model["url"]))
        g_siteName.add(model["siteName"])
In [ ]:
len(g_domain)
In [ ]:
soup: bs4.BeautifulSoup
In [ ]:
g_driver = get_driver()
In [ ]:
# g_driver = webdriver.Firefox()
In [ ]:
walked = set()
In [ ]:
del common_process
del finance_stockstar_com

3_fun¶

In [ ]:
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup as bsoup


def common_process(soup: bs4.BeautifulSoup, model, css_selector, exclude_kw=None):
    # Generic extractor: try each CSS selector against the saved HTML first,
    # then fall back to re-fetching the page with the shared webdriver.
    if not isinstance(css_selector, (list, tuple)):
        css_selector = [css_selector]
    for sel in css_selector:
        try:
            content = ""
            try:
                for child in soup.select_one(sel).children:
                    text = child.text.strip()
                    if len(text) >= 1 and (exclude_kw is None or exclude_kw not in text):
                        content += text + "\n"
                assert len(content) > 2
            except Exception:
                print(traceback.format_exc())
                content = ""
                g_driver.get(model["url"])
                time.sleep(1)
                content = "\n".join(it.text for it in g_driver.find_elements(By.CSS_SELECTOR, sel))
                html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
                with open(html_file, 'w', encoding='utf-8') as fpw:
                    fpw.write(g_driver.page_source)
            print(content)
            return content
        except Exception:
            pass
    

def finance_eastmoney_com(soup: bs4.BeautifulSoup, model):
    try:
        content = ""
        if '抱歉,您访问的页面不存在或已删除' in str(soup):
            print(model["url"], '抱歉,您访问的页面不存在或已删除')
            return content

        for child in soup.select_one("div#ContentBody").children:
            if 'APP内免费看>>' in str(child.text) \
                or 'class="b-chart' in str(child):
                pass
            else:
                content += child.text
        print(content)
        return content
    except:
        pass
    
    try:
        g_driver.get(model["url"])
        content= "\n".join(map(lambda it: it.text,g_driver.find_elements(By.CSS_SELECTOR,
                                                                "div#ContentBody")))
        html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
        with open(html_file, 'w', encoding='utf-8') as fpw:
            fpw.write(g_driver.page_source)
        return content
        # content = soup.select_one("div.abstract").text.strip()
    except:
        content = ''
    if '抱歉,您访问的页面不存在或已删除' in str(soup):
        print(model["url"], '抱歉,您访问的页面不存在或已删除')
        return content
    
    for child in soup.select_one("div#ContentBody").children:
        if 'APP内免费看>>' in str(child.text) \
            or 'class="b-chart' in str(child):
            pass
        else:
            content += child.text
    print(content)
    return content

def caifuhao_eastmoney_com(soup: bs4.BeautifulSoup, model):
    
    try:
        content = soup.select_one("div.article-body").text.strip()
        return content
    except:
        content = ''
    if '抱歉,您访问的页面不存在或已删除' in str(soup) or '该文章已被删除' in str(soup):
        print(model["url"], '抱歉,您访问的页面不存在或已删除')
        return content
    for child in soup.select_one("div#ContentBody").children:
        if 'APP内免费看>>' in str(child.text) \
            or 'class="b-chart' in str(child):
            pass
        else:
            content += child.text
    print(content)
    return content

def yuanchuang_10jqka_com_cn(soup: bs4.BeautifulSoup, model):
    content = ""
    for child in soup.select("div.main-text p:not([class])"):
        content += child.text + "\n"
    print(content)
    return content

def gu_qq_com(soup: bs4.BeautifulSoup, model):
    content = None
    try:
        content = soup.select_one("div.news-text").text
    except:
        try:
            g_driver.get(model["url"])
            content=  g_driver.find_element(By.CSS_SELECTOR,"div.news-text").text
            html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
            with open(html_file, 'w', encoding='utf-8') as fpw:
                fpw.write(g_driver.page_source)
        except NoSuchElementException:
            return None
    print(content)
    return content

def www_wabei_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, 'div.subject-content')

def stock_hexun_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, ['div.art_contextBox','div.quick_context'])

def stockstar_com(soup: bs4.BeautifulSoup,  model):
    if '您正在搜索的页面可能已经删除' in soup.text:
        return None
    content = ""
    try:
        for child in soup.select_one("div.article_content").children:
            content += child.text + "\n"
    except:
        content = ""
        g_driver.get(model["url"])
        try:
            content=  g_driver.find_element(By.CSS_SELECTOR,"div.article_content").text
            html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
            with open(html_file, 'w', encoding='utf-8') as fpw:
                fpw.write(g_driver.page_source)
        except NoSuchElementException:
            return None
    print(content)
    return content

def sina_com_cn(soup: bs4.BeautifulSoup, model):
    content = ""
    for child in soup.select("div#artibody p"):
        content += child.text + "\n"
    print(content)
    return content

def gelonghui_com(soup: bs4.BeautifulSoup,  model):
    return common_process(soup, model, ["article", "div.dtb-content"])

def www_abi_com_cn(soup: bs4.BeautifulSoup, model):
    content = ""
    try:
        for child in soup.select_one("section.textblock").children:
            content += child.text + "\n"
    except:
        content = ""
        g_driver.get(model["url"])
        content=  g_driver.find_element(By.CSS_SELECTOR,"section.textblock").text
        html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
        with open(html_file, 'w', encoding='utf-8') as fpw:
            fpw.write(g_driver.page_source)
    print(content)
    return content


def www_sohu_com(soup: bs4.BeautifulSoup, model):
    css_sel = "article.article"
    if soup.select_one("section.lookall"):
        content = ""
        g_driver.get(model["url"])
        try:
            g_driver.find_element(By.CSS_SELECTOR, 'section.lookall').click()
            print("="*50, "查看全文")
        except Exception as e:
            print(traceback.format_exc())
        content=  g_driver.find_element(By.CSS_SELECTOR,css_sel).text
        html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
        with open(html_file, 'w', encoding='utf-8') as fpw:
            fpw.write(g_driver.page_source)
    else:
        content = common_process(soup, model, css_sel) 
    print(content)
    return content

def www_laoyaoba_com(soup: bs4.BeautifulSoup, model):
    if '很抱歉,您访问的新闻不存在' in soup.text:
        return None
    css_sel = "div.media-article-content"
    if model["source"] is None:
        source = soup.select_one("div.media-source span").text
        if '来源:' in source:
            model["source"] = source.replace('来源:', "").strip()
    content = common_process(soup, model, css_sel) 
    print(content)
    return content

def jrj_com_cn(soup: bs4.BeautifulSoup, model):
    css_sel = "div.article_content"
    if model["source"] is None:
        if soup.select_one("div.article_info i"):
            source = soup.select_one("div.article_info i").text
            if '来源:' in source:
                model["source"] = source.replace('来源:', "").strip()
    content = common_process(soup, model, css_sel) 
    print(content)
    return content

def www_cls_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content-main-box div.detail-content")

def www_ccement_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.word")

def www_guandian_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.con_l_inner")

def www_lejucaijing_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.detail-content")

def hexun_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.art_contextBox")

def irnews_cfbond_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.tab-item")

def ggjd_cnstock_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.qmt_content_div")

def cs_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article.cont_article section")

def ifeng_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content-info")

def new_qq_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#ArticleContent")

def news_winshang_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.win-news-content")

def www_meadin_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.out-article")

def www_traveldaily_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.articleContent")

def www_bjnews_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#contentStr div")

def xueqiu_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article.article__bd")

def www_p5w_net(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content-text")

def www_zhitongcaijing_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.news-content div.news-body-content")

def caijing_com_cn(soup: bs4.BeautifulSoup, model):
    if 'https://file.caijing.com.cn/404.gif' in str(soup.select("div.center")):
        print("404", model["url"])
        return None
    return common_process(soup, model, "div.article-content")

def xuangubao_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, ".article_1Q_lp")

def www_jwview_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content_zw")

# def lights_ofweek_com(soup: bs4.BeautifulSoup, model):
#     return common_process(soup, model, "div.artical")

def ofweek_com(soup: bs4.BeautifulSoup, model):
    if soup.select_one("div#content-404") and not soup.select_one("div.artical"):
        print("404", model["url"])
        return None
    return common_process(soup, model, "div.artical")

def preview_ylaigc_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "section.content")

def www_jiemian_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article-content")

def www_cb_com_cn(soup: bs4.BeautifulSoup, model):
    if len(soup.text.strip()) == 0:
        print("webdriver 失效")
        return None
    return common_process(soup, model, "div.content_page")

def www_eeo_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.xd-nr div.xx_boxsing")

def www_bbtnews_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#pageContent")

def cnfol_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.Article" ,exclude_kw="由证券之星根据公开信息整理")

def stock_cnfol_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.Article" ,exclude_kw="由证券之星根据公开信息整理")

def company_cnstock_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#qmt_content_div")

def cnstock_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#qmt_content_div")

def finance_ce_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#articleText")

def www_portcontainer_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.nei tbody tbody tbody")

def www_yicai_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.m-txt")

def www_163_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.post_body")

def www_tmtpost_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article")

def www_nbd_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.g-articl-text")

def _36kr_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.articleDetailContent")

def flash_jin10_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.detail-content")

def news_smm_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.news-detail-main-article")

def www_cqcb_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.sycontent")

def bjx_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.cc-article")

def www_elecfans_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.rticle-content")

def info_texnet_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.detail-text")

def www_mydigit_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.pcb table")

def sigmaintell_com(soup: bs4.BeautifulSoup, model):
    return None
    # return common_process(soup, model, "TODO")

def mp_weixin_qq_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#js_content")

def news_hea_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content-wrap")

def news_cheaa_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article")

def www_techweb_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#content")

def news_mydrivers_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.news_info")

def www_chinatimes_net_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content")

def finance_china_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#fontzoom")

def _3bf_cc(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.contTxt")

def static_hongzhoukan_com(soup: bs4.BeautifulSoup, model):
    if '404 Not Found' in soup.text:
        return None
    assert 0, "TODO"
    

def www_mei_net_cn(soup: bs4.BeautifulSoup, model):
    assert 0, "手动处理吧"

def www_zqrb_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.news_content")

def www_cnmn_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#txtcont")

def news_leju_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.detail-content")

def www_iyiou_com(soup: bs4.BeautifulSoup, model):
    return None


def www_evpartner_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#newscontent")


def www_cnautonews_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.articleDetailContent")


def www_dcement_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.conTxt")

def www_cnfol_hk(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#Content")

def api3_cls_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "section.content-box")

def www_yemacaijing_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.wangEditor-container")

def news_mysteel_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content")

def www_fx678_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article-cont")

def www_powerpigs_net(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#content")

def www_soozhu_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article_body")

def www_jiemodui_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article.content")

def info_chineseshipping_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.TRS_Editor")

def www_thepaper_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.index_cententWrap__Jv8jK")

def www_escn_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "span#detailContent")

def www_d1ev_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.ws-md")

def www_china5e_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#articleBox")

def www_zyzhan_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.contentText")

def www_ebrun_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.post-text")

def www_semi_org_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.single-page-post")

def www_dsb_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article-content")

def www_changjiangtimes_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.c_zw", exclude_kw='CJTIMES_GET')

def www_time_weekly_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article.main_article")

def www_dzzq_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article-content")

def www_investorscn_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content-text")

def www_ccmn_cn(soup: bs4.BeautifulSoup, model):
    return None

def www_lanjinger_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.lj-layout-main")

def www_linkshop_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article.page")

def www_gzdaily_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.detailNews  div.mianConLeft > div.ng-binding")

def h5_newaircloud_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.TRS_Editor")

def paper_cnstock_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article-content")

def www_csrc_gov_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content")

def www_jjckb_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.mainCon")

def www_cceep_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.text")

def www_ifnews_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.centerText")

def www_mscbsc_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#articlebody")

def www_nmpa_gov_cn(soup: bs4.BeautifulSoup, model):
    assert 0, 'handle manually'
    return common_process(soup, model, "div#articlebody")

def www_caiguu_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#newscontent")

def www_eefocus_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.article-content")

def www_chinaipo_com(soup: bs4.BeautifulSoup, model):
    return None
    # assert 0, 'TODO'
    # return common_process(soup, model, "div.article-content")

def www_gg_lb_com(soup: bs4.BeautifulSoup, model):
    if 'cannot connect server' in soup.text:
        return None
    assert 0, 'TODO'
    return common_process(soup, model, "div.article-content")

def news_eccn_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.Newsdetaile")

def stock_eastmoney_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#ContentBody")

def futures_eastmoney_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#ContentBody")

def www_szse_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#desContent")

def house_ifeng_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content-info")

def www_financialnews_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content_body")


def huanqiu_com(soup: bs4.BeautifulSoup, model):
    return bsoup(soup.select_one("textarea.article-content").text).text

def news_pedaily_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.news-content")

def www_gongkong_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.summary")

def www_chinaports_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#news_content")

def www_port_org_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.info_content")

def m_mp_oeeee_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#BodySection")

def society_people_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.rm_txt_con")


def auto_gasgoo_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.contentDetailed")

def www_guancha_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.all-txt")


def news_21_sun_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content")


def www_cninct_com(soup: bs4.BeautifulSoup, model):
    return None
    # return common_process(soup, model, "div.content")

def www_news_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#detail")

def news_pharmnet_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content")

def www_cpnn_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.cpnnlist")

def www_97caijing_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article")

def www_cbminfo_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.box-news table")

def www_96369_net(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.main-content")

def www_ledinside_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.node_body")

def www_eaonline_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "body > div:nth-child(4) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > table:nth-child(1) > tbody:nth-child(1)")

def news_ccmn_cn(soup: bs4.BeautifulSoup, model):
    assert 0, 'TODO'
    return common_process(soup, model, "TODO")

def www_aicaijing_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.ant-spin-container .ArticleDetail-module__content--3Pe68")

def news_carnoc_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#newstext")

def www_ccaonline_cn(soup: bs4.BeautifulSoup, model):
    if '未找到页面 - 404' in soup.text:
        return None
    return common_process(soup, model, "TODO")

def gas_in_en_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#article")

def www_linkshop_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "article.page section")

def www_texindex_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.necontent1")

def www_huxiu_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#article-content")

def www_sxcoal_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content")

def people_com_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.rm_txt_con", exclude_kw='.tjewm')

def www_cas_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.xl_content")

def www_canyin88_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.content-editor")

def www_cnlist_com(soup: bs4.BeautifulSoup, model):
    if 'The proxy server received an invalid response' in soup.text:
        return None
    assert 0, 'TODO'
    return common_process(soup, model, "TODO")

def www_xxsb_com(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div#content")

def tech_gmw_cn(soup: bs4.BeautifulSoup, model):
    return common_process(soup, model, "div.u-mainText")

_fun end¶

In [ ]:
model
In [ ]:
html_file
In [ ]:
g_driver.get(model["url"])
"\n".join(map(lambda it: it.text,g_driver.find_elements(By.CSS_SELECTOR,
                                                        "div#content")))
In [ ]:
soup.text
In [ ]:
content = ""
for child in soup.select_one("div.u-mainText ").children:
    if len(child.text.strip()) > 0 and '.tjewm' not in child.text:
        content += child.text.strip() + "\n"
print(content)
In [ ]:
walked.add(html_file)
In [ ]:
print(model, '\n',model["url"], end='\r')
In [ ]:
domain_parse_fun = {
    "finance.eastmoney.com": finance_eastmoney_com,
    "yuanchuang.10jqka.com.cn": yuanchuang_10jqka_com_cn,
    "gu.qq.com": gu_qq_com,
    "www.wabei.cn": www_wabei_cn,
    "stock.hexun.com": stock_hexun_com,
    "hexun.com": stock_hexun_com,
    "finance.stockstar.com": stockstar_com,
    "10jqka.com.cn": yuanchuang_10jqka_com_cn,
    "finance.sina.com.cn": sina_com_cn,
    "sina.com.cn": sina_com_cn,
    "www.gelonghui.com": gelonghui_com,
    "stockstar.com": stockstar_com,
    "www.abi.com.cn": www_abi_com_cn,
    "www.sohu.com": www_sohu_com,
    "www.laoyaoba.com": www_laoyaoba_com,
    "www.cls.cn": www_cls_cn,
    "stock.jrj.com.cn": jrj_com_cn,
    "www.ccement.com": www_ccement_com,
    "www.guandian.cn": www_guandian_cn,
    "www.lejucaijing.com": www_lejucaijing_com,
}
def parse_html(model: dict, soup: bs4.BeautifulSoup) -> str:
    return domain_parse_fun[get_domain(model["url"])](soup, model)
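The refresh cells below repeat the same globals()-based fallback twice: a domain such as www.sohu.com is mapped to the handler name www_sohu_com from section 3_fun, with the bare second-level domain and a leading underscore tried as fallbacks. A sketch of that lookup as a single helper (hypothetical, not used by the cells below, and omitting the 10jqka/stockstar/sina special cases):
In [ ]:
def find_parse_fun(url: str):
    # Explicit mapping first, then the handler-naming convention from 3_fun.
    domain = get_domain(url)
    if domain in domain_parse_fun:
        return domain_parse_fun[domain]
    parts = domain.replace("-", "_").split(".")
    candidates = (
        "_".join(parts),              # e.g. www_sohu_com
        "_".join(parts[1:]),          # e.g. sohu_com
        "_" + "_".join(parts),        # leading-digit domains, e.g. _36kr_com
        "_" + "_".join(parts[1:]),
    )
    for name in candidates:
        fun = globals().get(name)
        if callable(fun):
            return fun
    return None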
In [ ]:
errors = []
count = 0

4_refresh¶

In [ ]:
from bs4 import BeautifulSoup as bsoup
count = len(walked)
_globals = globals()
for k, models in res.items():
    for model in models:
        
        html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
        if html_file in walked:
            continue
        elif model["content"] and len(model["content"]) > 5:
            walked.add(html_file)
            continue
        elif os.path.exists(html_file):
            try:
                print(count, "html_file:", html_file)
                
                def can_parse():
                    domain = get_domain(model["url"])
                    if domain.endswith("10jqka.com.cn"):
                        return domain_parse_fun["10jqka.com.cn"]
                    elif domain.endswith("stockstar.com"):
                        return domain_parse_fun["stockstar.com"]
                    elif domain.endswith("sina.com.cn"):
                        return domain_parse_fun["sina.com.cn"]
                    else:
                        domain_splits = domain.replace("-", '_').split(".")
                        return _globals.get('_'.join(domain_splits)) \
                            or _globals.get("_".join(domain_splits[1:]))\
                            or _globals.get("_" + "_".join(domain_splits))\
                            or _globals.get("_" + "_".join(domain_splits[1:]))
                fun = can_parse()
                if model["content"]  is not None and len(model["content"]) > 1:
                    walked.add(html_file)
                    count += 1
                    print("*" * 30)
                    continue
                elif not fun:
                    with open(html_file, 'r', encoding='utf-8') as fpr:
                        soup = bsoup(fpr.read())
                    raise KeyError
                elif html_file in walked:
                    print("^" * 5, count,"skip:", html_file)
                else:
                    with open(html_file, 'r', encoding='utf-8') as fpr:
                        soup = bsoup(fpr.read())
                    model["content"] = fun(soup, model)
                    walked.add(html_file)
                    print("=" * 5,"add", count, html_file)
                count += 1
                if count % 50 == 0:
                    with open(f"{OUTFILE_BASEDIR}/data_recent_08-22.json", 'w', encoding='utf-8') as fpw:
                        dump(res, fpw)
            except:
                errors.append(model)
        else:
            print("*" * 5, html_file)
In [ ]:
from bs4 import BeautifulSoup as bsoup

count = len(walked)
_globals = globals()
for k, models in res.items():
    for model in models:
        html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
        if html_file not in walked and model in errors:
            if os.path.exists(html_file):
                try:
                    print(count, "html_file:", html_file)
                    
                    def can_parse():
                        domain = get_domain(model["url"])
                        if domain.endswith("10jqka.com.cn"):
                            return domain_parse_fun["10jqka.com.cn"]
                        elif domain.endswith("stockstar.com"):
                            return domain_parse_fun["stockstar.com"]
                        elif domain.endswith("sina.com.cn"):
                            return domain_parse_fun["sina.com.cn"]
                        else:
                            domain_splits = domain.replace("-", '_').split(".")
                            return _globals.get('_'.join(domain_splits)) \
                                or _globals.get("_".join(domain_splits[1:]))\
                                or _globals.get("_" + "_".join(domain_splits))\
                                or _globals.get("_" + "_".join(domain_splits[1:]))
                    fun = can_parse()
                    if model["content"]:
                        walked.add(html_file)
                    elif not fun:
                        with open(html_file, 'r', encoding='utf-8') as fpr:
                            soup = bsoup(fpr.read())
                        raise KeyError
                    elif html_file in walked:
                        print("^" * 5, count,"skip:", html_file)
                    else:
                        with open(html_file, 'r', encoding='utf-8') as fpr:
                            soup = bsoup(fpr.read())
                        model["content"] = fun(soup, model)
                        walked.add(html_file)
                        print("=" * 5,"add", count, html_file)
                    count += 1
                    if count % 50 == 0:
                        with open(f"{OUTFILE_BASEDIR}/data_recent_08-22.json", 'w', encoding='utf-8') as fpw:
                            dump(res, fpw)
                    errors.remove(model)
                except Exception as e:
                    # print(traceback.format_exc())
                    errors.append(model)
                    raise e
            else:
                print("*" * 5, html_file)
In [29]:
 
In [26]:
data = []
ct = 0
for k, models in res.items():
    for model in models:
        if model["content"] and len(model["content"]) > 5:
            ct += 1
            if model["source"] is None:
                model["source"] = model["siteName"]
            data.append(model)
print('count', len(data))
with open(f"{OUTFILE_BASEDIR}/out_08-22.json", 'w', encoding='utf-8') as fpw:
    dump(data, fpw)
count 13701
In [28]:
import pandas as pd
pd.DataFrame(data).to_excel(f"E:/萝卜投研/萝卜投研.xlsx", index=False)
In [ ]:
with open(f"{OUTFILE_BASEDIR}/data_recent10.json", 'w', encoding='utf-8') as fpw:
    dump(res, fpw)
In [ ]:
soup.select_one("body")
In [ ]:
has_add = set()
out_cout = 0
In [ ]:
ind = 0
need = 0

todo = dict()
for k, models in res.items():
    ct = 0
    for model in models:
        if model["content"] and len(model["content"]) > 6:
            ct += 1
    if ct < 10:
        ind += 1
        need += 10 - ct
        print(ind, k, 10 - ct)
        todo[k] = models
print('need',need)
In [ ]:
todo
In [ ]:
for k, models in todo.items():
    stock_code , g_sentiment = k.split("_")
    bdir = os.path.join(OUTFILE_BASEDIR, stock_code)
    for name in os.listdir(bdir):
        if name.startswith(g_sentiment):
            g_stock_name = None
            stock = os.path.basename(bdir)
            with open(os.path.join(bdir, "info.json"), 'r', encoding='utf-8') as fpr:
                info_json = json.load(fpr)
                g_stock_code, g_stock_name, _, _ = info_json
            with open(os.path.join(bdir, name), 'r', encoding='utf-8') as fpr:
                to_add = map(lambda it: parse(it, g_stock_name, g_stock_code, g_sentiment), 
                                    json.load(fpr)["data"]["hits"])
                all_urls = set(map(lambda it: it["url"], models))
                models.extend(filter(lambda it: it['url'] not in all_urls, to_add))
    print(k, len(models))
In [ ]:
 
In [ ]:
for k, models in res.items():
    for model in models:
        domain = get_domain(model["url"])
        if domain in has_add:
            continue
        
        html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
        if not os.path.exists(html_file):
            continue
        
        with open(html_file, 'r', encoding='utf-8') as fpr:
            soup = bsoup(fpr.read())
        
        if soup.select_one("body") and model["title"]  in soup.select_one("body").text:
            out1 = {}
            out1["url"] = model["url"]
            text = soup.select_one("body").text
            out1["txt"] = text[text.index(model["title"]):]
            with open(f'e:/to_check/{domain}.json', 'w', encoding='utf-8') as fpw:
                out_cout += 1
                print(out_cout)
                has_add.add(domain)
                fpw.write(dumps(out1))
        else:
            print(model["title"], model["url"])