一、同花顺(App)

1、先爬取同花顺首页推荐的所有用户的个人主页:在首页下滑翻页 10000 次;

2、访问这些个人主页链接,通过个人主页--动态(只要图文)爬取帖子。

In [ ]:
from datetime import datetime

# Starting point of the crawl, written as plain wall-clock text.
begin_date = "2023-08-01 04:00:00"

# Parse the text, then render it in the HTTP-date layout; the result is the
# initial value of the If-Modified-Since request header.
# NOTE(review): the parsed value is naive and the "GMT" suffix is a literal,
# so no timezone conversion happens here - confirm that is intended.
date = datetime.strptime(begin_date, "%Y-%m-%d %H:%M:%S")
formatted_date = f"{date:%a, %d %b %Y %H:%M:%S GMT}"
print("begin_date", begin_date, formatted_date)

import requests
import time
import os
import json

def parse_resp(resp_json):
    """Save one feed response to disk and collect the authors it mentions.

    The response is written to ``<basedir>/<timestamp>.json``, where the
    timestamp is the current ``If-Modified-Since`` value of the module-level
    ``headers`` dict. Every entry of ``pageItems[*]["combination"]`` that has
    an ``"author"`` key is serialised to a JSON string and appended to the
    module-level ``authors`` list unless an identical string is already there.

    Args:
        resp_json: Decoded JSON body of a feed response (a dict).

    Reads the module-level names ``headers``, ``basedir`` and ``authors``;
    mutates ``authors`` in place.
    """
    stamp = datetime.strptime(headers["If-Modified-Since"], '%a, %d %b %Y %H:%M:%S GMT')
    file_name = stamp.strftime('%Y-%m-%d_%H%M%S.json')
    with open(f'{basedir}/{file_name}', 'w', encoding='utf-8') as fpw:
        # Persist the payload that was actually passed in, instead of reaching
        # for whatever the global ``response`` happens to be at call time.
        json.dump(resp_json, fpw, ensure_ascii=False)

    # Set lookup avoids rescanning the whole list for every candidate.
    known = set(authors)
    # ``or []`` tolerates a missing key as well as an explicit null.
    for item in resp_json.get("pageItems") or []:
        for info in item.get("combination") or []:
            if "author" not in info:
                continue
            author = json.dumps(info, ensure_ascii=False)
            if author in known:
                continue
            known.add(author)
            authors.append(author)
            print("="*30)
            print("add author", author)
    print("authors len:", len(authors))

# HTTP headers sent with every request to the recommend feed endpoint.
# 'If-Modified-Since' starts at the formatted begin date and is overwritten
# with the previous response's Date header while polling.
headers = {
    'Host': 'recommend.10jqka.com.cn',
    'Connection': 'Keep-Alive',
    # 'Accept-Encoding': 'gzip',
    'If-Modified-Since': formatted_date,
    'User-Agent': 'Hexin_Gphone/10.80.01 (Royal Flush) hxtheme/0 innerversion/G037.08.773.1.32 followPhoneSystemTheme/0 userid/-686312585 getHXAPPAccessibilityMode/0 hxNewFont/1 isVip/0 getHXAPPFontSetting/normal getHXAPPAdaptOldSetting/0 okhttp/3.14.9',
}

# Query-string parameters sent with the same feed request.
# NOTE(review): the m*/m*a fields look like device identifiers captured from
# a real app session - confirm their meaning before changing them.
params = {
    'reqType': '4',
    'plat': 'g',
    'time': '0',
    'userid': '686312585',
    'version': 'G037.08.773',
    'gid': '1',
    'mo': '0',
    'm2': '',
    'm1a': '7ddac4a9d9185ba791e6f2c318fe03b8',
    'm1': '7031c737a36ed0a1',
    'm6': '',
    'm6a': '',
    'nn': '同花顺',
    'm5': '',
    'm0': '',
    'm0a': '',
}



# Directory that receives one JSON dump per feed response.
basedir = "msgs"
if not os.path.exists(basedir):
    os.makedirs(basedir)
    print("创建:", basedir)


def dump_authors():
    """Write the current ``authors`` list to ``authors.json`` (overwriting it)."""
    with open("authors.json", 'w', encoding='utf-8') as fpw:
        json.dump(list(authors), fpw)


def load_authors():
    """Return the list stored in ``authors.json``, or ``[]`` if the file is absent."""
    if not os.path.exists("authors.json"):
        return []
    with open("authors.json", 'r', encoding='utf-8') as fpr:
        return json.load(fpr)


authors = load_authors()
print("*"*10, len(authors), '*'*10)

# First feed request. It has to come after ``basedir`` and ``authors`` are
# defined because parse_resp() reads both; issuing it earlier raised a
# NameError on a fresh run.
response = requests.get(
    'https://recommend.10jqka.com.cn/feed/api/v3/index',
    params=params,
    headers=headers,
    timeout=30,  # never hang forever on a stalled connection
)
response.raise_for_status()
# The server's Date header becomes the next If-Modified-Since value.
next_date = response.headers["Date"]
parse_resp(response.json())

def dump_to_file(path, content):
    """Overwrite the file at *path* with the text *content*, encoded as UTF-8."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(content)

def load_from_file(path):
    """Return the stripped UTF-8 text of *path*, or ``None`` if it does not exist."""
    if os.path.exists(path):
        with open(path, mode='r', encoding='utf-8') as source:
            text = source.read()
        return text.strip()
    return None
In [ ]:
import traceback
# Poll the recommend feed up to 100 times. Each round sends the previous
# response's Date header as If-Modified-Since, stores the response through
# parse_resp(), and records the new cursor on disk so a later run can resume.
print("*"*10, 'len(authors):', len(authors), '*'*10)
try:
    for _ in range(100):
        try:
            print('next_date', next_date)
            # Fall back to the cursor saved by an earlier run.
            cursor = next_date or load_from_file("next_date")
            if not cursor:
                # Explicit check instead of ``assert``, which -O strips.
                raise ValueError("no If-Modified-Since value: next_date is empty and no 'next_date' file exists")
            headers["If-Modified-Since"] = cursor
            response = requests.get(
                'https://recommend.10jqka.com.cn/feed/api/v3/index',
                params=params,
                headers=headers,
                timeout=30,  # a stalled connection must not block the loop forever
            )
            response.raise_for_status()
            resp_json = response.json()
            parse_resp(resp_json)
            next_date = response.headers["Date"]
            # Persist the cursor before sleeping so an interrupt during the
            # pause does not lose it.
            dump_to_file("next_date", next_date)
            if len(authors) % 5==0:
                dump_authors()
            time.sleep(5.5)
        except Exception:
            # Best effort: report the failure, back off, then try again.
            # ``Exception`` (not a bare except) lets Ctrl+C stop the loop.
            traceback.print_exc()
            time.sleep(15.5)
finally:
    # Always save what was collected, even when the loop is interrupted.
    dump_authors()

获取个人动态信息

In [ ]:
import requests
import logging
from logging import handlers
import sys
import json
import time
import traceback

# Logging setup: every record goes to stdout and to a log file that rotates
# at midnight and keeps five backups.
LOG_DIR = 'd:/logs'
if not os.path.exists(LOG_DIR):
    print("创建: ", LOG_DIR)
    # makedirs also creates missing parent directories, unlike mkdir.
    os.makedirs(LOG_DIR, exist_ok=True)

# One formatter shared by both handlers.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

fh = handlers.TimedRotatingFileHandler(filename=f"{LOG_DIR}/tonghuashun.log", when='midnight', interval=1, backupCount=5)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)

# The handler list is passed inline so the name ``handlers`` keeps referring
# to the ``logging.handlers`` module instead of being rebound to a list.
# No ``format=`` argument: basicConfig only applies it to handlers that lack
# a formatter, and both handlers above already have one.
logging.basicConfig(
    level=logging.INFO,
    handlers=[fh, ch],
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


# Output directory for the per-user article pages; created on first use.
basedir = "articles"
articles_dir_present = os.path.exists(basedir)
if not articles_dir_present:
    os.makedirs(basedir)
    logger.info("创建:%s", basedir)
    

def load_authors():
    """Return the list stored in ``authors.json``; an absent file yields ``[]``."""
    store = "authors.json"
    if os.path.exists(store):
        with open(store, mode='r', encoding='utf-8') as source:
            return json.load(source)
    return []
In [ ]:
# HTTP headers for the user-page content endpoint on t.10jqka.com.cn.
# This rebinds the module-level ``headers`` name used earlier for the feed
# requests.
# NOTE(review): 'hexin-v' looks like a token captured from a live app
# session - confirm whether it expires and how it should be refreshed.
headers = {
    'Host': 't.10jqka.com.cn',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'hexin-v': 'A9JDyq714qSGTx5Yc3nrOLZ3IpO049Z9COfKoZwr_gVwr30JhHMmjdh3GrZv',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 13; 22081212C Build/TKQ1.220829.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/104.0.5112.97 Mobile Safari/537.36 Hexin_Gphone/10.83.03 (Royal Flush) hxtheme/0 innerversion/G037.08.791.1.32 followPhoneSystemTheme/0 userid/686221341 getHXAPPAccessibilityMode/0 hxNewFont/1 isVip/1 getHXAPPFontSetting/normal getHXAPPAdaptOldSetting/0',
    'Content-Type': 'application/json',
    'Accept': '*/*',
    'Origin': 'https://t.10jqka.com.cn',
    'X-Requested-With': 'com.hexin.plat.android',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://t.10jqka.com.cn/lgt/user_page/no-referrer',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
}


def write_articles(user_id,resp_json):
    """Store one response page as ``<basedir>/<user_id>_<end_time - 1>.json``.

    ``end_time`` is read from ``resp_json["data"]``. The whole response is
    written as UTF-8 JSON with non-ASCII characters kept as-is, and the
    target path is logged.
    """
    cursor = resp_json["data"]["end_time"]
    file_path = '{}/{}_{}.json'.format(basedir, user_id, cursor - 1)
    with open(file_path, mode='w', encoding='utf-8') as sink:
        serialized = json.dumps(resp_json, ensure_ascii=False)
        sink.write(serialized)
        logger.info(">> %s", file_path)

def get_end_time(user_id):
    """Return the smallest ``end_time`` among the pages saved for *user_id*.

    Scans ``basedir`` for files named ``<user_id>_*`` (the naming used by
    ``write_articles``), loads each one, and logs how many article ids it
    holds. A summary line (total ids, distinct ids, max and min ``end_time``)
    is printed.

    Args:
        user_id: Author id whose saved pages are inspected.

    Returns:
        The minimum ``data.end_time`` found, or ``''`` when no page of this
        user has been saved yet.
    """
    # Match on the "<id>_" prefix; a plain substring test would also pick up
    # other users' files (e.g. id 123 inside "1234_...").
    prefix = f"{user_id}_"
    article_ids = []
    end_times = []
    for name in os.listdir(basedir):
        if not name.startswith(prefix):
            continue
        json_file = os.path.join(basedir, name)
        with open(json_file, 'r', encoding='utf-8') as fpr:
            data = json.load(fpr)["data"]
        page_ids = [entry["info"]["id"] for entry in data["contents"]]
        logger.info("%s: %s", json_file, len(page_ids))
        end_times.append(data["end_time"])
        article_ids.extend(page_ids)
    end_time_max = max(end_times, default=0)
    # ``None`` marks "nothing saved", replacing the float-timestamp sentinel.
    end_time_min = min(end_times, default=None)
    print(len(article_ids), len(set(article_ids)), end_time_max, end_time_min)
    return '' if end_time_min is None else end_time_min

def _already_collected(user_id):
    """Return True when ``basedir`` holds at least one page file of *user_id*."""
    # Prefix match on "<id>_" so that id 123 does not match "1234_...".
    prefix = f"{user_id}_"
    return any(name.startswith(prefix) for name in os.listdir(basedir))


# For every known author that has no saved page yet, walk the author's
# content list page by page: each response's ``end_time`` is sent as the
# ``end_time`` of the next request, until a response carries none.
for it in load_authors():
    user_id = json.loads(it)["author"]["id"]
    if _already_collected(user_id):
        logger.info("%s 已经采集", user_id)
        continue
    json_data = {
        'user_id': user_id,
        'end_time': get_end_time(user_id),
    }

    # At most 1000 rounds per author; failed rounds count towards the limit.
    for _attempt in range(1000):
        try:
            # NOTE(review): write_articles() names files
            # "<user_id>_<end_time - 1>.json", so this path is never produced
            # by this script - confirm whether the check is still needed.
            json_file = f'{basedir}/{json_data.get("end_time")}.json'
            if os.path.exists(json_file):
                logger.info("已经存在 %s", json_file)
                with open(json_file, 'r', encoding='utf-8') as fpr:
                    json_data['end_time'] = json.load(fpr)["data"]["end_time"]
                continue
            response = requests.post(
                'https://t.10jqka.com.cn/user_center/open/api/content/v2/get_by_uid',
                headers=headers,
                json=json_data,
                timeout=30,  # a stalled connection must not block the crawl
            )
            response.raise_for_status()
            resp_json = response.json()
            data = resp_json["data"]
            if not data:
                # Handled by the retry branch below, like any other failure,
                # but with a message that says what went wrong.
                raise ValueError(f"empty 'data' in response for user {user_id}")
            if not data.get("end_time"):
                # No further cursor: this author's list is exhausted.
                break
            json_data['end_time'] = data["end_time"]

            write_articles(json_data['user_id'], resp_json)
            time.sleep(2)
        except Exception:
            # Best effort: log, back off, retry the same page.
            # ``Exception`` (not a bare except) lets Ctrl+C stop the crawl.
            logger.error(traceback.format_exc())
            time.sleep(12)
In [ ]:
# Ad-hoc check for the ``user_id`` left over from the crawl loop: gather the
# article ids of all its saved pages and print total ids, distinct ids, and
# the largest / smallest ``end_time`` seen.
article_ids = []
end_time_max = 0
# Upper bound for the minimum search.
# NOTE(review): looks like a millisecond epoch timestamp - confirm.
end_time_min = 1691037791000
# Prefix match on "<id>_" so other users whose id merely contains this one
# are not counted.
user_prefix = f"{user_id}_"
for name in os.listdir(basedir):
    if not name.startswith(user_prefix):
        continue
    json_file = os.path.join(basedir, name)
    with open(json_file, 'r', encoding='utf-8') as fpr:
        resp = json.load(fpr)
    temp = [entry["info"]["id"] for entry in resp["data"]["contents"]]
    logger.info("%s: %s", json_file, len(temp))
    end_time_max = max(end_time_max, resp["data"]["end_time"])
    end_time_min = min(end_time_min, resp["data"]["end_time"])
    article_ids.extend(temp)
print(len(article_ids), len(set(article_ids)), end_time_max, end_time_min)
In [ ]:
# Notebook display: number of collected article ids (duplicates included).
len(article_ids)