In [5]:
import pip
try:
selenium
except:
pip.main(['install', 'selenium'])
print("install selenium success!")
import selenium
from bs4 import BeautifulSoup as bsoup
from selenium import webdriver
import time
import re
import json
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait #WebDriverWait注意大小写
from selenium.webdriver.common.by import By
option = webdriver.ChromeOptions()
option.add_argument("--disable-cache")
option.add_argument("--disable-javascript")
option.set_capability('pageLoadStrategy', 'none')
option.add_experimental_option('excludeSwitches', ['enable-automation'])
prefs = {
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True,
"profile.managed_default_content_settings.images": 2,
"profile.managed_default_content_settings.media_stream": 2
}
option.add_experimental_option('prefs', prefs)
# driver = webdriver.Chrome(
# executable_path="d:/chromedriver.exe", chrome_options=option)
def get_driver():
# driver = webdriver.Chrome(options=option)
driver = webdriver.Chrome(executable_path="d:/chromedriver.exe", chrome_options=option)
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',
{'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'})
driver.implicitly_wait(5)
return driver
driver = get_driver()
driver.get('https://longportapp.com/topics/8839770')
wait = WebDriverWait(driver, 10)
element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "article p")))
from bs4 import BeautifulSoup as bsoup
soup = bsoup(driver.page_source)
soup.select("article")
d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\ipykernel_launcher.py:39: DeprecationWarning: use options instead of chrome_options
Out[5]:
[<article><h1 class="mb-5 text-2xl font-semibold text-black">微创医疗的资本版图乱弹</h1><div><div class="topic-rich-content"><p style="margin-left:0px">近期微创医疗再次引起市场的关注。对于这样一家致力于生物医药高科技的公司,近年来已经分拆多家子公司上市。</p><p style="margin-left:0px">其实最早是药明康德开启的这一路径。从药明康德,到药明巨诺,药明生物,药明合联等,药明系上市公司众多。母公司将某一在细分行业具有明显优势的子公司分拆,并运作其上市,将对母公司和子公司的估值都有较好的影响。</p><p style="margin-left:0px">自从药明系开启了先河之后,很多其他的公司也开始陆续跟上。乐普医疗相继拆分乐普诊断、乐普生物、心泰医疗等子公司冲击 IPO;辽宁成大也顺利拆分成大生物登陆 A 股;华大基因旗下的华大智造,长春高新下的百克生物,都很好的实践了药明康德的路径。</p><p style="margin-left:0px">而在这整个过程中做得做好的,恐怕非微创医疗莫属。</p><figure class="image"><img original-src="https://pub.lbkrs.com/uploads/2023/b527dccebf682bdf48630a1b4dac58b6" src="https://pub.lbkrs.com/uploads/2023/b527dccebf682bdf48630a1b4dac58b6?x-oss-process=style/lg"/></figure><p style="margin-left:0px">到目前为止,微创医疗已经相继拆分了心脉医疗、心通医疗、微创机器人、微电生理-U,微创脑科学等五家公司成功在 A 股和 H 股上市。并起,微创医疗明确表示,未来公司将培育出一批优秀的公司并推动它们 IPO 上市。</p><p style="margin-left:0px">微创医疗表示,目前公司旗下的 12 个细分板块,未来都会有一家上市公司作为旗舰。</p><p style="margin-left:0px">2023 年 5 月 18 日,微创医疗拟再度分拆子公司微创心律在港交所 IPO,拟于主板上市。</p><p style="margin-left:0px">从微创医疗的资本版图来看,公司作为一家不断探索前沿医疗行业的企业,一直都在努力地借助资本市场的力量,实现自身价值的最大化。</p><p style="margin-left:0px">不过对于投资者而言,这些被分拆出来的公司,都还有一定的风险。</p><p style="margin-left:0px">微创医疗能够通过资本化顺利地退出。但是这些公司的运营还需要市场的检验。由于是前沿技术,很多技术的价格和市场化推进都比较缓慢。所以公司业绩很难达到预期。</p><p style="margin-left:0px">不过公司 IPO 之后,与微创医疗的关系就没有那么大了。即便是标的公司股价腰斩,可能微创医疗早已经在股价还比较高的时候就已经收回了成本。</p><p style="margin-left:0px">不过微创医疗的动作,可以作为一个市场的风向标。</p><p style="margin-left:0px">就像韩国一直都有世界经济金丝雀的说法,微创医疗的资本市场动作,分拆子公司 IPO 也可以认为是市场对行业或者板块关注度逐渐转好的一个信号。毕竟,市场好的时候才可能有更多的投资者为高科技产品买单。</p><p style="margin-left:0px">按照港股的审核流程,预计微创心律可能在年底或者明年年初就可以顺利上市。</p><p style="margin-left:0px">如果真的是这样的话,那么是不是也可以认为微创医疗看好了今年后期港股市场对生物医药反弹的偏爱呢?这还是挺值得期待的。<span class="security-tag" counter_id="ST/HK/853" name="微创医疗" trend="2" type="security-tag">$微创医疗.HK</span></p></div></div><div class="h-0 overflow-hidden" id="topic-images"><div class="images-gallery_images-gallery__R1kAR"><ul class="inline-flex flex-wrap -mb-2 w-52 sm:w-96"><li class="hairline-border relative cursor-pointer gallery-image mr-2 mb-2 only-one last-image"><div class="h-full"><div class="relative h-full image-item"><img alt="0" src="https://pub.lbkrs.com/uploads/2023/b527dccebf682bdf48630a1b4dac58b6?x-oss-process=style/md"/></div></div></li></ul><div class="hidden"><div class="lb-image"><img class="lb-image-img" src="https://pub.lbkrs.com/uploads/2023/b527dccebf682bdf48630a1b4dac58b6?x-oss-process=style/2048"/><div class="lb-image-mask"><div class="lb-image-mask-info"><span aria-label="eye" class="lb lb-eye" role="img"><svg aria-hidden="true" data-icon="eye" fill="currentColor" focusable="false" height="1em" viewbox="64 64 896 896" width="1em"><path d="M942.2 486.2C847.4 286.5 704.1 186 512 186c-192.2 0-335.4 100.5-430.2 300.3a60.3 60.3 0 000 51.5C176.6 737.5 319.9 838 512 838c192.2 0 335.4-100.5 430.2-300.3 7.7-16.2 7.7-35 0-51.5zM512 766c-161.3 0-279.4-81.8-362.7-254C232.6 339.8 350.7 258 512 258c161.3 0 279.4 81.8 362.7 254C791.5 684.2 673.4 766 512 766zm-4-430c-97.2 0-176 78.8-176 176s78.8 176 176 176 176-78.8 176-176-78.8-176-176-176zm0 288c-61.9 0-112-50.1-112-112s50.1-112 112-112 112 50.1 112 112-50.1 112-112 112z"></path></svg></span>Preview</div></div></div></div></div></div></article>]
In [2]:
%pip install redis
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Collecting redis Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/2e/409703d645363352a20c944f5d119bdae3eb3034051a53724a7c5fee12b8/redis-4.6.0-py3-none-any.whl (241 kB) ---------------------------------------- 0.0/241.1 kB ? eta -:--:-- - -------------------------------------- 10.2/241.1 kB ? eta -:--:-- ---- -------------------------------- 30.7/241.1 kB 330.3 kB/s eta 0:00:01 --------- --------------------------- 61.4/241.1 kB 469.7 kB/s eta 0:00:01 ---------------- ------------------- 112.6/241.1 kB 595.3 kB/s eta 0:00:01 ---------------------- ------------- 153.6/241.1 kB 654.6 kB/s eta 0:00:01 --------------------------------- -- 225.3/241.1 kB 811.5 kB/s eta 0:00:01 ------------------------------------ 241.1/241.1 kB 821.7 kB/s eta 0:00:00 Collecting async-timeout>=4.0.2 Using cached https://pypi.tuna.tsinghua.edu.cn/packages/d6/c1/8991e7c5385b897b8c020cdaad718c5b087a6626d1d11a23e1ea87e325a7/async_timeout-4.0.2-py3-none-any.whl (5.8 kB) Installing collected packages: async-timeout, redis Successfully installed async-timeout-4.0.2 redis-4.6.0 Note: you may need to restart the kernel to use updated packages.
In [ ]:
i
#!/bin/bash
LOGS_DIR="/vdb/logs/commonSpider/" # 替换为你的 logs 文件夹路径
BACKUP_DIR="/vdb/output_backup/"
FILE_SUFFIX=*.log
delete_logs() {
local dir="$1"
local current_date=$(date +%Y-%m-%d)
echo "enter $1"
if [[ "$1" != /vdb* ]]; then
return
fi
# 遍历文件夹下的所有文件和子文件夹
for file in "$dir"/*; do
if [[ -d "$file" ]]; then
# 递归调用函数处理子文件夹
delete_logs "$file"
elif [[ -f "$file" && "$file" == $FILE_SUFFIX ]]; then
# 提取文件名中的日期部分
date=$(basename "$file" | grep -oP '\d{4}-\d{2}-\d{2}')
# 检查日期是否不等于当前日期,如果是则删除文件
if [[ $date != $current_date ]]; then
echo "Deleted file: $file"
rm "$file"
fi
else
echo "$file" == $FILE_SUFFIX
fi
done
}
# 调用函数删除日志文件
delete_logs "$LOGS_DIR"
delete_backup(){
local dir="$1"
echo "enter $1"
# 遍历文件夹下的所有文件和子文件夹
for file in "$dir"/*; do
if [[ -d "$file" ]]; then
# 递归调用函数处理子文件夹
delete_backup "$file"
elif [[ -f "$file" && "$file" == *.html ]]; then
echo "Deleted file: $file"
rm "$file"
else
echo "$file 不符合条件"
fi
done
}
delete_backup "$BACKUP_DIR"
In [ ]:
In [ ]:
it: str = it.decode()
print(it)
#redis_client.srem()
In [ ]:
#TODO
redis_password = 'r1oKi8lp9mBGzeOdqBt'
master_name = 'mymaster'
sentinels = "10.195.24.179:26379,10.210.8.218:26379,10.210.8.219:26379".split(",")
def in_set(task_code:str, all_keys: set) -> str:
for it in all_keys:
if task_code in it:
return it
return None
#TODO
def biz_fun(redis_client: 'redis.Redis'):
all_keys = set()
for it in redis_client.smembers("activetopic"):
it: str = it.decode()
all_keys.add(it)
print(len(all_keys))
with open("remove_task_code.txt", 'a', encoding='utf-8') as fpw:
with open("task_codes.txt", 'r', encoding='utf-8') as fpr:
ind = 0
for line in fpr:
task_code = line.strip()
it = in_set(task_code, all_keys)
if task_code and it:
ind += 1
print(ind, task_code, it)
fpw.write(f"{ind},{task_code},{it}\n")
redis_client.srem("activetopic", it)
break
redis_client.sadd("activetopic", 'task_queue:gwc_full_init.line_1@gwc_full_init.task_1')
try:
import redis
except ImportError:
import pip
pip.main(['install', 'redis'])
import redis
from redis.sentinel import Sentinel
sentinels = [(it.split(":")[0].strip(), int(it.split(":")[1])) for it in sentinels]
print('sentinels', sentinels)
# 创建哨兵对象
sentinel = Sentinel(sentinels, socket_timeout=2, password=redis_password)
# 获取主节点连接
master = sentinel.master_for(
master_name, socket_timeout=2, password=redis_password)
redis_client = master
try:
biz_fun(redis_client)
finally:
redis_client.close()
In [ ]:
#TODO
redis_password = 'r1oKi8lp9mBGzeOdqBt'
master_name = 'mymaster'
sentinels = "10.195.24.179:26379,10.210.8.218:26379,10.210.8.219:26379".split(",")
def in_set(task_code:str, all_keys: set) -> str:
for it in all_keys:
if task_code in it:
return it
return None
#TODO
def biz_fun(redis_client: 'redis.Redis'):
all_keys = set()
key_name = "gwc_full:next_task_code"
for it in redis_client.smembers(key_name):
it: str = it.decode()
all_keys.add(it)
print(len(all_keys))
with open("remove_task_code_gwc_full.txt", 'a', encoding='utf-8') as fpw:
with open("task_codes.txt", 'r', encoding='utf-8') as fpr:
ind = 0
for line in fpr:
task_code = line.strip()
it = in_set(task_code, all_keys)
if task_code and it:
ind += 1
print(ind, task_code, it)
fpw.write(f"{ind},{task_code},{it}\n")
redis_client.srem(key_name, it)
break
redis_client.sadd(key_name, 'gwc_full_init.task_1')
try:
import redis
except ImportError:
import pip
pip.main(['install', 'redis'])
import redis
from redis.sentinel import Sentinel
sentinels = [(it.split(":")[0].strip(), int(it.split(":")[1])) for it in sentinels]
print('sentinels', sentinels)
# 创建哨兵对象
sentinel = Sentinel(sentinels, socket_timeout=2, password=redis_password)
# 获取主节点连接
master = sentinel.master_for(
master_name, socket_timeout=2, password=redis_password)
redis_client = master
try:
biz_fun(redis_client)
finally:
redis_client.close()