一、同花顺(App)
1、先爬取同花顺首页推荐的所有用户的个人主页(向下滑动翻页 10000 次);
2、访问这些个人主页链接,通过"个人主页 -- 动态"(只要图文)爬取帖子。
In [ ]:
from datetime import datetime
# Naive start timestamp for the crawl, written as "YYYY-MM-DD HH:MM:SS".
begin_date = "2023-08-01 04:00:00"
date = datetime.strptime(begin_date, '%Y-%m-%d %H:%M:%S')
# Re-render the same instant in the HTTP-date layout. The literal "GMT"
# suffix is appended as-is; no timezone conversion is performed.
formatted_date = datetime.strftime(date, '%a, %d %b %Y %H:%M:%S GMT')
print('begin_date', begin_date, formatted_date)
import requests
import time
import os
import json
def parse_resp(resp_json):
    """Persist one feed response and collect the new authors it mentions.

    The payload is written under ``basedir`` to a file named after the
    ``If-Modified-Since`` value currently stored in the module-level
    ``headers`` dict. Every entry of ``pageItems[*]["combination"]`` that
    has an ``"author"`` key is serialized to a JSON string and appended to
    the module-level ``authors`` list, unless an identical string is
    already present.

    Args:
        resp_json: Decoded JSON body of a feed response (a dict).

    Raises:
        KeyError: If ``headers`` has no ``"If-Modified-Since"`` entry.
        ValueError: If that header is not in the expected HTTP-date layout.

    Reads the globals ``headers``, ``basedir`` and ``authors``; ``authors``
    is mutated in place.
    """
    stamp = datetime.strptime(headers["If-Modified-Since"], '%a, %d %b %Y %H:%M:%S GMT')
    file_name = stamp.strftime('%Y-%m-%d_%H%M%S.json')
    with open(f'{basedir}/{file_name}', 'w', encoding='utf-8') as fpw:
        # Serialize the argument itself so the function does not depend on
        # a global `response` object being in sync with `resp_json`.
        fpw.write(json.dumps(resp_json, ensure_ascii=False))

    # Set copy gives O(1) duplicate checks; `authors` stays the ordered store.
    known = set(authors)
    # A payload without "pageItems" / "combination" is treated as empty.
    for item in resp_json.get("pageItems") or []:
        for info in item.get("combination") or []:
            if "author" not in info:
                continue
            author = json.dumps(info, ensure_ascii=False)
            if author in known:
                continue
            known.add(author)
            authors.append(author)
            print("=" * 30)
            print("add author", author)
    print("authors len:", len(authors))
# HTTP headers sent with the GET requests to the recommend.10jqka.com.cn
# feed endpoint. 'If-Modified-Since' starts at formatted_date; parse_resp()
# also reads this entry to derive the name of the file it writes.
# NOTE(review): the User-Agent presumably mirrors a captured app request and
# embeds a userid — confirm it is safe to keep in source.
headers = {
    'Host': 'recommend.10jqka.com.cn',
    'Connection': 'Keep-Alive',
    # 'Accept-Encoding': 'gzip',
    'If-Modified-Since': formatted_date,
    'User-Agent': 'Hexin_Gphone/10.80.01 (Royal Flush) hxtheme/0 innerversion/G037.08.773.1.32 followPhoneSystemTheme/0 userid/-686312585 getHXAPPAccessibilityMode/0 hxNewFont/1 isVip/0 getHXAPPFontSetting/normal getHXAPPAdaptOldSetting/0 okhttp/3.14.9',
}
# Query-string parameters sent with every feed request.
# NOTE(review): the m1/m1a/userid values look like device or account
# identifiers copied from a captured request — meaning not visible here,
# confirm before reusing or publishing.
params = {
    'reqType': '4',
    'plat': 'g',
    'time': '0',
    'userid': '686312585',
    'version': 'G037.08.773',
    'gid': '1',
    'mo': '0',
    'm2': '',
    'm1a': '7ddac4a9d9185ba791e6f2c318fe03b8',
    'm1': '7031c737a36ed0a1',
    'm6': '',
    'm6a': '',
    'nn': '同花顺',
    'm5': '',
    'm0': '',
    'm0a': '',
}
# Directory where parse_resp() stores each raw feed response.
basedir = "msgs"
if not os.path.exists(basedir):
    os.makedirs(basedir)
    print("创建:", basedir)


def dump_authors():
    """Write the module-level ``authors`` list to ``authors.json``."""
    with open("authors.json", 'w', encoding='utf-8') as fpw:
        json.dump(list(authors), fpw)


def load_authors():
    """Return the list stored in ``authors.json``, or ``[]`` if the file is absent."""
    if not os.path.exists("authors.json"):
        return []
    with open("authors.json", 'r', encoding='utf-8') as fpr:
        return json.load(fpr)


# Authors collected by earlier runs; parse_resp() appends to this list.
authors = load_authors()
print("*"*10, len(authors), '*'*10)

# First feed request. It has to run after `basedir` and `authors` exist,
# because parse_resp() reads both of them.
response = requests.get('https://recommend.10jqka.com.cn/feed/api/v3/index', params=params, headers=headers)
# Fail with the HTTP error itself instead of a JSON decode error on 4xx/5xx.
response.raise_for_status()
# The server's Date header becomes the If-Modified-Since of the next request.
next_date = response.headers["Date"]
parse_resp(response.json())
def dump_to_file(path, content):
    """Overwrite the file at ``path`` with the string ``content`` (UTF-8)."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(content)
def load_from_file(path):
    """Return the stripped UTF-8 text of ``path``, or ``None`` if it does not exist."""
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as source:
            text = source.read()
        return text.strip()
    return None
In [ ]:
import traceback
# Poll the feed up to 100 times. Each round sends the previous response's
# Date header as If-Modified-Since, stores the payload via parse_resp(),
# and persists the cursor so a later run can resume from the "next_date" file.
print("*"*10, 'len(authors):', len(authors), '*'*10)
try:
    for _ in range(100):
        try:
            print('next_date', next_date)
            headers["If-Modified-Since"] = next_date or load_from_file("next_date")
            if headers["If-Modified-Since"] is None:
                # Explicit check instead of `assert`, which is stripped under -O.
                raise ValueError("no next_date in memory or in the 'next_date' file")
            response = requests.get('https://recommend.10jqka.com.cn/feed/api/v3/index', params=params, headers=headers)
            response.raise_for_status()
            resp_json = response.json()
            parse_resp(resp_json)
            next_date = response.headers["Date"]
            if len(authors) % 5 == 0:
                dump_authors()
            # Save the cursor before sleeping so an interrupt during the
            # pause does not lose it.
            dump_to_file("next_date", next_date)
            time.sleep(5.5)
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate so the crawl can actually be stopped.
            traceback.print_exc()
            time.sleep(15.5)
finally:
    # Always flush the collected authors, even when the loop is interrupted.
    dump_authors()
获取个人动态信息
In [ ]:
import requests
import logging
from logging import handlers
import sys
import json
import time
import traceback
# Make sure the log directory exists before the file handler opens its file.
if not os.path.exists('d:/logs'):
    print("创建: ", 'd:/logs')
    os.mkdir('d:/logs')

# Console handler writing to stdout.
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
# File handler that rolls over at midnight and keeps 5 backups.
fh = handlers.TimedRotatingFileHandler(filename="d:/logs/tonghuashun.log", when='midnight', interval=1, backupCount=5)
fh.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Stored under its own name so the imported `logging.handlers` module
# (bound to `handlers`) is not overwritten by a list.
log_handlers = [fh, ch]
logging.basicConfig(
    level=logging.INFO,
    # NOTE(review): both handlers already carry `formatter`; per the
    # basicConfig documentation this format only applies to handlers that
    # have none, so it appears to have no effect here — confirm which
    # layout is intended.
    format='%(asctime)s [%(levelname)s] [%(module)s-%(threadName)s] %(lineno)d - %(message)s',
    handlers=log_handlers
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Directory where write_articles() stores each downloaded page.
basedir = "articles"
if not os.path.exists(basedir):
    os.makedirs(basedir)
    logger.info("创建:%s", basedir)
def load_authors():
    """Load the author list saved in ``authors.json``.

    Returns:
        The decoded JSON content of ``authors.json`` in the current working
        directory, or an empty list when that file does not exist.
    """
    if os.path.exists("authors.json"):
        with open("authors.json", 'r', encoding='utf-8') as source:
            return json.load(source)
    return []
In [ ]:
# HTTP headers sent with the POST requests to the t.10jqka.com.cn
# get_by_uid endpoint further down in this file.
# NOTE(review): 'hexin-v' looks like a session or anti-bot token copied from
# a captured request; it is hard-coded and may expire — confirm how it
# should be refreshed and whether it may be committed.
headers = {
    'Host': 't.10jqka.com.cn',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'hexin-v': 'A9JDyq714qSGTx5Yc3nrOLZ3IpO049Z9COfKoZwr_gVwr30JhHMmjdh3GrZv',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 13; 22081212C Build/TKQ1.220829.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/104.0.5112.97 Mobile Safari/537.36 Hexin_Gphone/10.83.03 (Royal Flush) hxtheme/0 innerversion/G037.08.791.1.32 followPhoneSystemTheme/0 userid/686221341 getHXAPPAccessibilityMode/0 hxNewFont/1 isVip/1 getHXAPPFontSetting/normal getHXAPPAdaptOldSetting/0',
    'Content-Type': 'application/json',
    'Accept': '*/*',
    'Origin': 'https://t.10jqka.com.cn',
    'X-Requested-With': 'com.hexin.plat.android',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://t.10jqka.com.cn/lgt/user_page/no-referrer',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
}
def write_articles(user_id, resp_json):
    """Save one response page for ``user_id`` as a JSON file under ``basedir``.

    The file is named ``<user_id>_<end_time - 1>.json``, where ``end_time``
    is taken from ``resp_json["data"]``. The written path is logged.

    Args:
        user_id: Identifier used as the file-name prefix.
        resp_json: Decoded response dict; must contain ``data.end_time``.
    """
    stamp = resp_json["data"]["end_time"] - 1
    file_path = f'{basedir}/{user_id}_{stamp}.json'
    with open(file_path, mode='w', encoding='utf-8') as sink:
        serialized = json.dumps(resp_json, ensure_ascii=False)
        sink.write(serialized)
    logger.info(">> %s", file_path)
def get_end_time(user_id):
    """Return the smallest ``end_time`` among the pages saved for ``user_id``.

    Only files in ``basedir`` whose name starts with ``"<user_id>_"`` are
    read (the prefix write_articles() uses), so other users' files and
    timestamps that merely contain the same digits are ignored. A summary
    line (total ids, distinct ids, max and min end_time) is printed.

    Args:
        user_id: Identifier whose saved pages are scanned.

    Returns:
        The minimum ``data.end_time`` found, or ``''`` when no page has been
        saved for this user.
    """
    prefix = f"{user_id}_"
    article_ids = []
    end_times = []
    for name in os.listdir(basedir):
        if not name.startswith(prefix):
            continue
        json_file = os.path.join(basedir, name)
        with open(json_file, 'r', encoding='utf-8') as fpr:
            resp = json.load(fpr)
        ids = [it["info"]["id"] for it in resp["data"]["contents"]]
        logger.info("%s: %s", json_file, len(ids))
        end_times.append(resp["data"]["end_time"])
        article_ids.extend(ids)

    # The defaults keep the summary line meaningful when nothing was found.
    end_time_max = max(end_times, default=0)
    end_time_min = min(end_times, default=time.time() * 1000)
    print(len(article_ids), len(set(article_ids)), end_time_max, end_time_min)
    # "Nothing saved yet" is decided by whether any file matched, not by
    # comparing floats against the current time.
    return end_time_min if end_times else ''
# For every collected author, page through the get_by_uid endpoint and save
# each page with write_articles(). Authors that already have a saved file
# are skipped.
for it in load_authors():
    user_id = json.loads(it)["author"]["id"]

    # Match on the "<user_id>_" prefix that write_articles() gives its files;
    # a plain substring test would also hit other users' ids and timestamps.
    prefix = f"{user_id}_"
    if any(name.startswith(prefix) for name in os.listdir(basedir)):
        logger.info("%s 已经采集", user_id)
        continue

    json_data = {
        'user_id': user_id,
        'end_time': get_end_time(user_id),
    }
    for _ in range(1000):
        try:
            # NOTE(review): write_articles() names files
            # "<user_id>_<end_time - 1>.json", so this "<end_time>.json"
            # path does not match them — confirm whether this cache check
            # is still intended.
            json_file = f'{basedir}/{json_data.get("end_time")}.json'
            if os.path.exists(json_file):
                logger.info("已经存在 %s", json_file)
                with open(json_file, 'r', encoding='utf-8') as fpr:
                    json_data['end_time'] = json.load(fpr)["data"]["end_time"]
                continue
            response = requests.post(
                'https://t.10jqka.com.cn/user_center/open/api/content/v2/get_by_uid',
                headers=headers,
                json=json_data,
            )
            resp_json = response.json()
            # A page that has data but no end_time stops this author.
            if resp_json["data"] and not resp_json["data"].get("end_time"):
                break
            json_data['end_time'] = resp_json["data"]["end_time"]
            write_articles(json_data['user_id'], resp_json)
            time.sleep(2)
        except Exception:
            # Ordinary failures (network, malformed or empty payload) are
            # logged and retried after a pause; KeyboardInterrupt and
            # SystemExit propagate so the crawl can be stopped.
            logger.error(traceback.format_exc())
            time.sleep(12)
In [ ]:
# Ad-hoc check for the `user_id` left over from the crawl loop: gather the
# article ids stored in that user's files and report duplicates and the
# end_time range.
article_ids = []
end_time_max = 0
# NOTE(review): presumably a millisecond timestamp used as the starting
# upper bound for the minimum — confirm where this value comes from.
end_time_min = 1691037791000
# Only files named "<user_id>_..." belong to this user; a substring test
# would also pick up other users' files.
prefix = f"{user_id}_"
for name in os.listdir(basedir):
    if not name.startswith(prefix):
        continue
    json_file = os.path.join(basedir, name)
    with open(json_file, 'r', encoding='utf-8') as fpr:
        resp = json.load(fpr)
    temp = [it["info"]["id"] for it in resp["data"]["contents"]]
    logger.info("%s: %s", json_file, len(temp))
    end_time_max = max(end_time_max, resp["data"]["end_time"])
    end_time_min = min(end_time_min, resp["data"]["end_time"])
    article_ids.extend(temp)
print(len(article_ids), len(set(article_ids)), end_time_max, end_time_min)
In [ ]:
# Trailing expression: evaluates to the number of ids gathered in
# article_ids (duplicates included).
len(article_ids)