In [17]:
from urllib.parse import urlparse

url = "http://gu.qq.com/resources/shy/news/detail-v2/index.html#/?id=nesSN2023071420052083c2f2b2&s=b"

parsed_url = urlparse(url)
domain = parsed_url.netloc
print(domain)
gu.qq.com
In [3]:
import concurrent.futures
import time
import threading

# 定义一个任务函数
def task(n):
    print("dsadsa", threading.current_thread())
    time.sleep(1)
    return n * n

# 创建ThreadPoolExecutor对象
with concurrent.futures.ThreadPoolExecutor(max_workers=3, thread_name_prefix="das_") as executor:
    # 提交任务到线程池
    future1 = executor.submit(task, 2)
    future2 = executor.submit(task, 3)
    future2 = executor.submit(task, 3)
    future2 = executor.submit(task, 3)
    future2 = executor.submit(task, 3)
    future2 = executor.submit(task, 3)
    future2 = executor.submit(task, 3)
    future2 = executor.submit(task, 3)

    # 获取任务的结果
    result1 = future1.result()
    result2 = future2.result()

    # 打印结果
    print(result1)  # 输出: 4
    print(result2)  # 输出: 9
dsadsa <Thread(das__0, started 16208)>
dsadsa <Thread(das__1, started 14688)>
dsadsa <Thread(das__2, started 17576)>
dsadsa <Thread(das__1, started 14688)>
dsadsa <Thread(das__2, started 17576)>
dsadsa <Thread(das__0, started 16208)>
dsadsadsadsa <Thread(das__2, started 17576)>
 <Thread(das__0, started 16208)>
4
9
In [ ]:
import requests

cookies = {
    'SINAGLOBAL': '2261485068514.264.1626428099137',
    'ULV': '1692006229115:37:3:1:1148142812924.0146.1692006229077:1691117572444',
    'UOR': ',,login.sina.com.cn',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5VRXLBxvm1laUTPLHenpbs5NHD95QcSKe0SK57ehqfWs4Dqcj.i--RiK.7i-i2i--RiKyFi-zpi--ciKn7iKn7i--Ni-2NiKyF',
    'ALF': '1722590781',
    'SUB': '_2A25J3Yf_DeRhGeBL6FEU8C7MzTmIHXVrISm3rDV8PUJbkNANLWnhkW1NRzcWkHHwIXrEdDq3QceIwcYwkDxN-NnI',
    '_s_tentry': 'weibo.com',
    'Apache': '1148142812924.0146.1692006229077',
    'PC_TOKEN': '8dbdce4904',
    'WBtopGlobal_register_version': '2023081417',
    'crossidccode': 'CODE-gz-1Qvu80-3iPcJA-rzF1PtHF7GPWtcU02bda6',
    'SSOLoginState': '1692006320',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    # 'Cookie': 'SINAGLOBAL=2261485068514.264.1626428099137; ULV=1692006229115:37:3:1:1148142812924.0146.1692006229077:1691117572444; UOR=,,login.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5VRXLBxvm1laUTPLHenpbs5NHD95QcSKe0SK57ehqfWs4Dqcj.i--RiK.7i-i2i--RiKyFi-zpi--ciKn7iKn7i--Ni-2NiKyF; ALF=1722590781; SUB=_2A25J3Yf_DeRhGeBL6FEU8C7MzTmIHXVrISm3rDV8PUJbkNANLWnhkW1NRzcWkHHwIXrEdDq3QceIwcYwkDxN-NnI; _s_tentry=weibo.com; Apache=1148142812924.0146.1692006229077; PC_TOKEN=8dbdce4904; WBtopGlobal_register_version=2023081417; crossidccode=CODE-gz-1Qvu80-3iPcJA-rzF1PtHF7GPWtcU02bda6; SSOLoginState=1692006320',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

response = requests.get('https://s.weibo.com/weibo?q=东方财富&page=3', cookies=cookies, headers=headers)
response.text
In [10]:
import requests

cookies = {
    'SINAGLOBAL': '2261485068514.264.1626428099137',
    'ULV': '1692006229115:37:3:1:1148142812924.0146.1692006229077:1691117572444',
    'UOR': ',,login.sina.com.cn',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5VRXLBxvm1laUTPLHenpbs5NHD95QcSKe0SK57ehqfWs4Dqcj.i--RiK.7i-i2i--RiKyFi-zpi--ciKn7iKn7i--Ni-2NiKyF',
    'ALF': '1722590781',
    'SUB': '_2A25J3Yf_DeRhGeBL6FEU8C7MzTmIHXVrISm3rDV8PUJbkNANLWnhkW1NRzcWkHHwIXrEdDq3QceIwcYwkDxN-NnI',
    'XSRF-TOKEN': 'Gfu7kLeYnWWCrzbXyEi0LHHd',
    'WBPSESS': 'Dt2hbAUaXfkVprjyrAZT_PQ8_D4nB_y8rj3cmrBqKQeK6J8zFU4D8RWASJTXMzAShVAr3fkzG3_LdjLvKN4Pb7lOrND6TycPdX9rAZuCZcwB7iiWXAnKVPgoYKp7OCdWSIVOHYi_VfJbBv5emvkuWH7vjWFIuy-PNlq2_9yTFgbRqC9XT54q22N8zjZTAk8eImwO1JcZb1Z4NDcTV55j8g==',
    '_s_tentry': 'weibo.com',
    'Apache': '1148142812924.0146.1692006229077',
    'PC_TOKEN': '8dbdce4904',
    'crossidccode': 'CODE-gz-1Qvu80-3iPcJA-rzF1PtHF7GPWtcU02bda6',
    'SSOLoginState': '1692006320',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Origin': 'https://s.weibo.com',
    'Connection': 'keep-alive',
    'Referer': 'https://s.weibo.com/',
    # 'Cookie': 'SINAGLOBAL=2261485068514.264.1626428099137; ULV=1692006229115:37:3:1:1148142812924.0146.1692006229077:1691117572444; UOR=,,login.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5VRXLBxvm1laUTPLHenpbs5NHD95QcSKe0SK57ehqfWs4Dqcj.i--RiK.7i-i2i--RiKyFi-zpi--ciKn7iKn7i--Ni-2NiKyF; ALF=1722590781; SUB=_2A25J3Yf_DeRhGeBL6FEU8C7MzTmIHXVrISm3rDV8PUJbkNANLWnhkW1NRzcWkHHwIXrEdDq3QceIwcYwkDxN-NnI; XSRF-TOKEN=Gfu7kLeYnWWCrzbXyEi0LHHd; WBPSESS=Dt2hbAUaXfkVprjyrAZT_PQ8_D4nB_y8rj3cmrBqKQeK6J8zFU4D8RWASJTXMzAShVAr3fkzG3_LdjLvKN4Pb7lOrND6TycPdX9rAZuCZcwB7iiWXAnKVPgoYKp7OCdWSIVOHYi_VfJbBv5emvkuWH7vjWFIuy-PNlq2_9yTFgbRqC9XT54q22N8zjZTAk8eImwO1JcZb1Z4NDcTV55j8g==; _s_tentry=weibo.com; Apache=1148142812924.0146.1692006229077; PC_TOKEN=8dbdce4904; crossidccode=CODE-gz-1Qvu80-3iPcJA-rzF1PtHF7GPWtcU02bda6; SSOLoginState=1692006320',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

response = requests.get('https://weibo.com/ajax/side/search?q=东方财富', cookies=cookies, headers=headers)
In [5]:
import os

count = 0
basedir = r'D:\WORK\UCP\UGC'
for name in os.listdir(basedir):
    if name.startswith('article_downloads'):
        for file_name in os.listdir(os.path.join(basedir, name)):
            if 'live' not in file_name and 'video' not in file_name:
                count += 1
count
Out[5]:
722772
In [4]:
text='''- [] 145 X 124389
- [x] 146 X 126581
- [x] 147 X 136277
- [x] 148 X 165695
- [x] 149 X 89936
- [x] 150  X 259940
- [x] 151 X 74541  
- [x] 152 X 162546
- [x] 153 X 516374
- [x] 154 X  173424
- [x] 155 X 88110
- [x] 156 X  135789
- [x] 157 X 164889   
- [x] 158 X 118319 已换Cookie 
- [x] 159 X  178240
- [x] 160 X  295349
- [x] 161 X 153842
- [x] 162  X 132281
- [x] 163  X 151145  153的 分配到这里
- [x] 164  X 244659 153的 分配到这里
'''.strip()

data= list(map(lambda it:it.strip().split()[4], text.split("\n")))
print(data)
sum(map(int, data))
['124389', '126581', '136277', '165695', '89936', '259940', '74541', '162546', '516374', '173424', '88110', '135789', '164889', '118319', '178240', '295349', '153842', '132281', '151145', '244659']
Out[4]:
3492326
In [ ]:
import json 

with open(r'\\172.31.227.176\jrtt\科技\2017-08-10\中关村在线\6452603443243647501.json', 'r', encoding='GBK') as fpr:
    data = json.load(fpr)
data
In [13]:
tt='''
时间  10.150.108.212(file_count,total_size)  10.150.108.213(file_count,total_size)  10.150.108.220(file_count,total_size)
20230810 13830,25G 12894,24G 13172,24G
20230811 13395,25G 13331,25G 13721,25G
20230812 10104,19G 10475,20G 10463,20G
20230813 13215,30G 12896,29G 12858,29G
20230814 12734,28G 13618,29G 13457,29G
20230815 13429,35G 13386,35G 13872,36G
20230816 10909,33G 10610,32G 9969,30G
20230817 9690,31G 9157,29G 9145,29G
'''.strip().replace(',', ":").split("\n")
In [6]:
import pandas as pd
In [14]:
pd.DataFrame(list(map(str.split, tt)))
Out[14]:
0 1 2 3
0 时间 10.150.108.212(file_count:total_size) 10.150.108.213(file_count:total_size) 10.150.108.220(file_count:total_size)
1 20230810 13830:25G 12894:24G 13172:24G
2 20230811 13395:25G 13331:25G 13721:25G
3 20230812 10104:19G 10475:20G 10463:20G
4 20230813 13215:30G 12896:29G 12858:29G
5 20230814 12734:28G 13618:29G 13457:29G
6 20230815 13429:35G 13386:35G 13872:36G
7 20230816 10909:33G 10610:32G 9969:30G
8 20230817 9690:31G 9157:29G 9145:29G
In [ ]:
i
BACKUP_DIR="/vdb/output"

delete_backup(){
    local dir="$1"

    echo "enter $1"
    # 遍历文件夹下的所有文件和子文件夹
    for file in "$dir"/*; do
        if [[ -d "$file" && "$file" != *baike.baidu* ]]; then
            # 递归调用函数处理子文件夹
            echo "delete $file"
            rm -rf "$file"
        else
            echo "$file 不符合条件"
        fi
    done
}

delete_backup $BACKUP_DIR
In [ ]:
import redis

import random

pool = redis.ConnectionPool(host='10.205.123.174', port=6379, decode_responses=True)
redis_cli = redis.Redis(host='10.205.123.174', port=6379, password='Abc@1234', decode_responses=True, db=4)

userIds = redis_cli.smembers("xueqiu_post_userid_set_153")

for userId in userIds:
    if '4642157440' != userId:
        redis_cli.sadd("xueqiu_post_userid_set_" + random.choice(('163', '164')))
    else:
        print("get", 4642157440)
In [3]:
import redis

redis_cli = redis.Redis(host='172.31.227.172', port=6379, db=0, decode_responses=True)
try:
    jrtt_author_hash = redis_cli.hgetall('lds:jrtt_author_hash')
except:
    redis_cli.close()
In [4]:
jrtt_author_hash
Out[4]:
{'涓鍥界數瀛愭姤': '350',
 '科协频道': '109',
 '正解局': '349',
 '明日电子数码': '78',
 '游民星空': '93',
 '36姘': '422',
 '安徽发布': '177',
 '妍妹说财经': '56',
 '环球科学': '148',
 '科学怂叔': '494',
 '浜烘皯缃': '385',
 '成都科普': '406',
 '子舒说': '57',
 '南方周末': '185',
 '萌格格日记': '434',
 '钛媒体APP': '190',
 '36氪': '3',
 '类星频道': '446',
 '龙哥唠硬嗑': '524',
 '数码小行家': '71',
 '锋潮评测': '328',
 '银河规划局': '136',
 '中国铁路': '23',
 '台海网': '44',
 '装修侠': '283',
 '大众日报': '51',
 '职场大白白': '116',
 '电动车小行家': '103',
 '年代': '462',
 '深圳新闻网': '206',
 '向知平说': '238',
 '侣行': '145',
 '创业者李孟': '348',
 '鍗婁袱璐㈢粡': '395',
 '科学网': '110',
 '远方青木': '129',
 '电气设计狄老师': '353',
 '建筑结构杂志社': '448',
 '你有小星星嘛': '465',
 '老孟解读': '427',
 '文汇网': '73',
 '老毛桃winpe': '246',
 '临淄融媒': '25',
 '新音响杂志': '76',
 '爵士浩南': '411',
 '地球味儿': '146',
 '千际咨询': '39',
 '甘肃农村广播': '377',
 '懂车帝视线': '66',
 '经典语录': '222',
 '乐居生活官': '301',
 '风吟枪啸': '285',
 '知知贵阳': '334',
 '鍗楁柟鍛ㄦ湯': '340',
 '界面快讯': '105',
 'Mac圈': '444',
 '中国科普博览': '182',
 '邦乔威与锋子呀': '473',
 '安静的记录生活感悟': '445',
 '设计圈子': '447',
 '地理科普局': '504',
 '内蒙古晨报': '212',
 '中国科技信息': '19',
 '数码参考': '239',
 'MZDN望世界': '262',
 '通信新仔': '451',
 '财经杂志': '125',
 '海外网': '214',
 '微笑fly': '235',
 '中国汽车报': '16',
 '中核集团': '24',
 '芒果宝贝2008': '486',
 '太原广播电视台经广': '493',
 'heart科技派': '5',
 '琅公': '487',
 '南方日报': '291',
 '小白测评': '202',
 '梁满玥': '84',
 '知识嗑儿': '155',
 '大象新闻客户端': '52',
 '水木视讯': '87',
 '大哥大杂谈': '426',
 '集集屋': '139',
 '鐢靛晢鍦ㄧ嚎': '378',
 '笑哥地图': '482',
 '陕西网': '429',
 '榛勭煶缃戣': '373',
 '快乐de东东爱旅行': '292',
 '海达郝老师': '91',
 '易观Analysys': '436',
 '探秘历史': '294',
 '神秘园': '512',
 '观察者网独家视频': '466',
 '雄观天下': '441',
 '森罗万象原创': '516',
 '入江闪闪': '275',
 '中关村在线': '15',
 '果粉易查': '476',
 '品茗阅法': '47',
 '星火彪WarOH': '413',
 '新浪教育': '215',
 '昔日数码': '209',
 '高谈数码': '142',
 '华夏时报': '41',
 '中经海南': '485',
 '维达说': '258',
 'opendotnet': '470',
 '宏哥谈商论道': '61',
 '文汇报': '226',
 '航空知识王亚男': '278',
 '安卓中国': '60',
 '水利天下薛国强': '483',
 '自由骆驼的厨房': '163',
 '中国电子报': '17',
 '亿玛信诺': '306',
 '搞机的小黄': '68',
 '笔吧评测室': '112',
 '93913虚拟现实': '410',
 '科技小辛': '509',
 '宅家凡人': '58',
 '澎湃新闻': '96',
 '中国科学报': '18',
 '城管发布': '368',
 '笔杆先生': '113',
 '极客小冷': '165',
 '鍘﹂棬鏃ユ姤绀': '342',
 '美居研习社': '296',
 '青橙融媒': '141',
 'DoNews': '178',
 '投影时代': '186',
 '今日头条': '33',
 '欧文酥': '304',
 '中国台湾网': '236',
 '大老张叨叨': '302',
 '新智元': '194',
 '潘扬老师': '94',
 '装修小超人': '121',
 '零玖UI设计': '140',
 '帆雨动画': '287',
 '人民数据': '248',
 '阿星机械': '375',
 '21世纪经济报道': '2',
 '老杜侃江山': '467',
 '三言科技Pro': '11',
 '央视网': '54',
 '什么值得买': '183',
 '正经的烧杯': '85',
 '观察者网': '205',
 '脑科学世界': '118',
 '久巨自动化设备': '161',
 '雷电BlueCube': '407',
 '小央视频': '449',
 '环球时报': '221',
 '海漂看世界': '457',
 '璐㈢粡缃': '415',
 '中国科协': '181',
 '波普兄弟': '89',
 '米奇走了': '114',
 'DV现场': '394',
 '快消八谈': '259',
 '电气知识课堂': '442',
 '中国网资讯': '22',
 '闀垮煄缃': '331',
 '玩机生活': '174',
 '机械之名': '397',
 '拾遗笔记': '459',
 '蓝大仙人': '519',
 '斋晗': '74',
 '地球记': '498',
 'A没完没了': '489',
 '数据派THU': '70',
 '劳动报': '341',
 '破土工作室': '220',
 '杨老师高考志愿研究': '263',
 '汉新闻': '439',
 '华西都市报': '223',
 '创客迷': '37',
 '图哥捡垃圾': '346',
 '鑽嗗窞鏂伴椈缃': '352',
 '澳洲财经见闻': '389',
 '啊涛科普': '48',
 '半个机长': '152',
 '企查查财经': '423',
 '北斗星豆': '370',
 '瞭望智库': '107',
 '财经网': '126',
 '掌上威海': '224',
 '星曜小蘑菇': '387',
 '四川人在路上': '244',
 '探客社': '67',
 '极客尚老师': '156',
 '老刘玩数码': '314',
 '城市王朝': '204',
 'DeepTech深科技': '192',
 '超能梦想实验室': '274',
 '涓鍏虫潙鍦ㄧ嚎': '147',
 '爱读书小王': '98',
 'HIFI音乐试听': '6',
 '八零波波': '501',
 '超能网': '196',
 '京东云开发者': '29',
 '远瞻智库': '130',
 '界面新闻': '237',
 '赛雷三分钟': '381',
 '硬件是一门学问': '456',
 '晓说通信': '81',
 'Atstudy网校': '420',
 '住小帮APP': '463',
 '芯智讯': '357',
 '东方网': '227',
 '科技铭程': '317',
 '工程宝典': '305',
 '船长科普记': '523',
 '智东西': '388',
 '科学大魔王': '520',
 '科普足迹': '322',
 '山西信息港': '229',
 '金昌发布': '189',
 '苟胜老师': '495',
 '职场老炮儿': '318',
 '云间生活': '27',
 '遇二白': '240',
 '星灿其里': '79',
 '钛科普': '461',
 '搜芯易': '430',
 '制造原理': '245',
 '上进SCI快发': '471',
 '二更': '26',
 '系统迷': '484',
 '央视新闻': '53',
 '小王深挖历史': '63',
 '山西社会与法治': '167',
 '机器人欢喜哥': '195',
 '模具设计壹杨老师': '402',
 'IT时报': '7',
 '互联心声': '28',
 '莎谈史记': '289',
 '南通广播电视台': '42',
 '环球科学猫': '172',
 '夏冰雹频道': '256',
 '新天水网': '455',
 '观研天下': '175',
 '快科技': '193',
 '小白Smooth': '267',
 '幻尘科技': '65',
 '华商韬略': '184',
 '鍗楁柟閮藉競鎶': '327',
 'APPLE频道': '4',
 '百姓追剧': '233',
 '瑶家光哥': '288',
 '大表哥科技汇': '480',
 'VR陀螺': '356',
 '浜烘皯鏃ユ姤': '418',
 '中国青年网': '213',
 '冷月星辰1975': '300',
 '爆侃家电圈': '474',
 '两晨科技': '14',
 'LUO51888': '293',
 '陇南武都发布': '384',
 '狂丸研究所': '99',
 '你不知道的事1236S6G': '453',
 '虚拟子弹': '120',
 '冯月月': '273',
 '冷科普': '36',
 '华硕之声': '369',
 '人民日报': '31',
 '1分钟妙招小窍门': '1',
 '收音机评论译介': '69',
 '世界科学': '13',
 '同福编程': '477',
 '人民网': '32',
 '启农说': '46',
 '中国建材报': '419',
 'Bensir本色说': '401',
 '朱知一二': '491',
 '钱江晚报': '135',
 '行迹旅途': '518',
 '科技有意思': '403',
 '长沙晚报掌上长沙': '138',
 '厂长评测': '316',
 '安兔兔': '59',
 '引力播新闻': '383',
 '黄石网警': '201',
 '有点机事': '82',
 '光明日报': '35',
 '鍗庡忔椂鎶': '363',
 '天逸音响': '265',
 '昆明应急': '77',
 '凤凰网': '216',
 '机器之心Pro': '297',
 '闈掓欒瀺濯': '325',
 'CGTN': '169',
 '济南市文化和旅游局': '90',
 '十七年平面UI现场教学': '280',
 '情兽先声': '269',
 '老廉颇': '428',
 '耳机林sir': '250',
 '微机分WekiHome': '218',
 '邮电设计技术': '404',
 '婢庢箖鏂伴椈': '416',
 '职场办公技能': '115',
 '轲子体': '128',
 '自由舒民': '500',
 '布丁评测': '210',
 '经典传奇': '490',
 'D哥说路网': '479',
 '浠涔堝煎緱涔': '354',
 '我真的是王二狗': '276',
 '姘戣埅璧勬簮缃': '330',
 '青蜂侠': '241',
 '水大叔': '438',
 '四川在线': '217',
 '三日说3C': '10',
 '巅峰高地': '64',
 '车事纪': '127',
 'Yoko视频工作室': '298',
 '张捷财经观察': '351',
 '云上新闻': '502',
 'TOM0818': '521',
 '健康时报': '34',
 '吉报调查': '45',
 '荆州新闻网': '119',
 'XCMG徐工起重': '510',
 '青年文摘': '230',
 '淘数码': '270',
 '瑙傚療鑰呯綉': '326',
 '瑙傜爺澶╀笅': '333',
 '小伊评科技': '464',
 '鉴史驿站': '134',
 '乐活e派': '525',
 '稀饭数码': '412',
 '涓鍥界綉鏁欒偛': '362',
 '娣卞湷鏂伴椈缃': '344',
 '毒舌财经': '249',
 '硬核拆解': '271',
 '小天说数码': '472',
 '小仙女大影迷': '62',
 '中国信息化周报': '157',
 '医食参考': '199',
 '厦门日报社': '176',
 '徐德文科学频道': '180',
 '简科技简哥': '254',
 '卡哥读史': '503',
 '浙江日报': '522',
 '特费心特斯拉': '272',
 '国资小新': '50',
 '猫妈爱旅行': '247',
 '神奇的数字创作者': '108',
 '数字贵州新闻': '284',
 '河南商报': '88',
 '历史时光鉴': '312',
 '涓夎█绉戞妧Pro': '329',
 '砺石商业评论': '164',
 '南京新闻': '279',
 '花朵财经': '158',
 '科技志': '460',
 '第一心理': '431',
 '大熊美食生活百科': '391',
 '中国经济周刊': '20',
 '装修个家': '153',
 '山西老邸': '323',
 '环球网': '101',
 '火星探长': '497',
 '新湖南': '75',
 '醉挽清风浑不觉': '307',
 '科普六点半': '506',
 '布衣小童': '243',
 '金属加工': '408',
 '久叔说科技': '450',
 '爆料观察家': '97',
 '人民政协网': '160',
 'YLTCK': '299',
 '酷玩实验室Coollabs': '242',
 '新鲜科视': '374',
 '京报网': '30',
 '映维Nweon': '80',
 '陆新之谈商业': '166',
 '新故事学堂官方': '231',
 '大卫聊科技': '399',
 '宗熙先生': '437',
 '君呼': '234',
 '卢克文工作室': '43',
 '谈史录': '124',
 '丁卯读写悟': '9',
 '半两财经': '40',
 '宋也在学PLC': '303',
 '民航资源网': '86',
 '潮新闻': '95',
 '一水遮夏': '260',
 '隔夜也很宅': '358',
 '尖端防务': '468',
 '瀹夊厰鍏': '332',
 '科学颜习社': '111',
 '科技拌饭': '208',
 '讲广普的YEX': '469',
 '电脑维修狮': '104',
 '看历史浮沉': '106',
 '我愛制造': '517',
 '环球时报新媒体': '345',
 '通信信息报': '390',
 '瓦力评测': '171',
 '大科技杂志社': '365',
 '科技V力': '364',
 '布衣粗食': '203',
 '科技一休哥': '393',
 '百科密码': '421',
 '赣州消防': '232',
 '传感器专家网': '405',
 '国家长江中心': '49',
 '上观新闻': '12',
 '兰台': '151',
 'HitMag': '320',
 '无知君': '315',
 '中国纺织杂志': '380',
 '鍑ゅ嚢鍗瑙': '343',
 '码农翻身': '398',
 '新缸中之脑': '452',
 '绿头怪科技': '417',
 '扶苏聊历史': '515',
 '股视大数据': '117',
 '胖刘老师智玩汇': '168',
 '大白菜装机': '252',
 '女王?陛下': '55',
 '黑猫科普': '496',
 '指文图书': '159',
 '乐晴行业观察': '367',
 '一条': '409',
 '蜀乡小果': '290',
 '李木子频道': '347',
 '小渊评宅': '200',
 '魔改工坊': '277',
 '较高端人类官方号': '514',
 '柳先说': '170',
 '脑洞爱好者': '188',
 '李大霄': '458',
 '消费文摘': '324',
 '科学火箭叔': '319',
 '大海实验室': '321',
 '旺仔甜妹妹8413': '144',
 '阿明观察': '339',
 '龙A爱学习': '143',
 '科学参考': '150',
 '双流机场飞机拍客': '311',
 '金芒美食生活小妙招': '133',
 '暨南大学': '359',
 '每天精选': '228',
 '三维地图看世界': '481',
 '热点科技': '361',
 '凤凰卫视': '179',
 '达梦数据库': '432',
 'monkeyzmk': '286',
 '香江城堡': '268',
 '中国三峡集团': '392',
 '梁叔谈财经': '83',
 '科技数码秀': '371',
 '齐点淄博': '478',
 '发光的二极管yoyo': '149',
 '圆桌派': '313',
 '电动湃': '173',
 '数码产品中毒患者': '281',
 '速看视讯': '131',
 '泉州网': '211',
 '广东公安': '379',
 '他山石话': '253',
 '河口发布': '424',
 '长城网': '191',
 'PChome电脑之家': '8',
 '消费者报道': '92',
 '中国网龙城视窗': '219',
 '每日经济新闻': '507',
 '中国网教育': '21',
 '央视财经': '337',
 '鍏呯數澶寸綉': '433',
 '装修美少女': '122',
 '科技发明秀': '508',
 'MIUI发烧友俱乐部': '308',
 '毛小毛数码': '513',
 '材料material': '360',
 '海岛椰民': '499',
 '现代家电': '102',
 '南方都市报': '251',
 '卢大头': '282',
 '瀹夊崜涓鍥': '376',
 '绿色中国': '435',
 '福州新闻网': '225',
 '边码故事': '511',
 '槽逻辑': '261',
 '萌尬科技': '505',
 '纵相新闻': '366',
 '震安科技': '266',
 '数码科技兄弟': '72',
 '充电头网': '197',
 '机械工程文萃': '443',
 '电商在线': '257',
 '果核剥壳': '414',
 '括苍烂柯': '310',
 '瑁呬慨渚': '425',
 '怪罗科普': '255',
 '三易生活': '336',
 '湖北消防': '382',
 '鐬鏈涙櫤搴': '400',
 '今说新语': '492',
 '设计汇DesignHub': '123',
 '家居杂坛': '454',
 '北京大学第三医院': '207',
 '转转App': '338',
 '猫爷的渔场': '100',
 '好强强锅锅': '488',
 '21世纪商业评论': '335',
 '集微网': '355',
 '十万个品牌故事': '38',
 '都市快报橙柿互动': '132',
 '字节范儿': '162',
 '巨潮WAVE': '475',
 '家有影院与HiFi': '154',
 '爱搞机': '187',
 '新疆兵团广播电视台': '198',
 '闪魔SmartDevil': '396',
 '超哥解历史': '264',
 '北京市科学技术研究院': '440',
 '涓鍥界粡娴庡懆鍒': '372',
 'CCFvoice': '386',
 '甘肃新闻网': '295',
 '太阳伞述教': '309',
 '长江日报': '137'}
In [1]:
import redis

redis_cli = redis.Redis(host='172.31.227.172', port=6379, db=0, decode_responses=True)

src_key = 'jrtt_kj_article_1'
to_key = 'jrtt_kj_article_'
Out[1]:
30891
In [5]:
all_fields = redis_cli.hkeys(src_key)

len(all_fields)
Out[5]:
30880
In [3]:
(291922 + 69645 + 16703+139214+107220) / 5
Out[3]:
124940.8
In [6]:
# pipeline = redis_cli.pipeline()

# for i in range(7_501, 15_000):
#     field = all_fields[i]
#     val = redis_cli.hget(src_key, field)
#     print(i, field, val, end='\r')
#     pipeline.hset(to_key[:-1], field, val)
#     pipeline.hdel(src_key, field)

# pipeline.execute()

pipeline = redis_cli.pipeline()

# for i in range(0, 7_500):
#     field = all_fields[i]
#     val = redis_cli.hget(src_key, field)
#     print(i, field, val, end='\r')
#     pipeline.hset(to_key+'1', field, val)
#     pipeline.hdel(src_key, field)

# pipeline.execute()

# pipeline = redis_cli.pipeline()

# for i in range(6_0001, 17_0000):
#     field = all_fields[i]
#     val = redis_cli.hget(src_key, field)
#     print(i, field, val, end='\r')
#     pipeline.hset(to_key+'2', field, val)
#     pipeline.hdel(src_key, field)

# pipeline.execute()

# pipeline = redis_cli.pipeline()

# for i in range(17_0001, 60_0000):
#     field = all_fields[i]
#     val = redis_cli.hget(src_key, field)
#     print(i, field, val, end='\r')
#     pipeline.hset(to_key+'3', field, val)
#     pipeline.hdel(src_key, field)

# pipeline.execute()

pipeline = redis_cli.pipeline()

for i in range(7_501, 19_000):
    field = all_fields[i]
    val = redis_cli.hget(src_key, field)
    print(i, field, val, end='\r')
    pipeline.hset(to_key+'4', field, val)
    pipeline.hdel(src_key, field)

pipeline.execute()
redis_cli.close()
18999 6239153967582937602 {'title': 'Apple Watch去年出货880万块 前景存疑', 'author': '中关村在线', 'cteateTime': '2016-01-13 13:33:58', 'postUrl': 'http://m.zol.com.cn/article/5634663.html?tuiguangid=toutiao', 'attitudes_count': 0}: 0}d591e1ad', 'attitudes_count': 0}itudes_count': 0}s_count': 2}Mdb2VAgLBYaKy1pC6OnAy5yiGTlZwyd*nidwsQJb8QqInof6aCfsNT*xukRTcHkHdP0JowL6UXMJmDgvLRiHl-JXh3ffoMY=', 'attitudes_count': 0}_count': 0}: 0}
In [5]:
redis_cli.close()
In [1]:
import os
import time
import shutil

count = 0
ind = 0
dest = fr'D:\WORK\UCP\UGC\article_downloads_{ind}'
if not os.path.exists(dest):
    os.makedirs(dest)

for dirpath, dirnames, filenames in os.walk(r'D:\WORK\UCP\UGC\article_downloads'):
    for name in filenames:
        if 'live' not in name and 'video' not in name:
            shutil.move(os.path.join(dirpath, name), os.path.join(dest, name))
            count += 1
            
            if count == 10_0000:
                count = 0
                ind += 1
                dest =  fr'D:\WORK\UCP\UGC\article_downloads_{ind}'
                print(count, end='\r')
                time.sleep(0.001)
                if not os.path.exists(dest):
                    os.makedirs(dest)
0
In [ ]:
import zipfile
import os
import socket

hostname = socket.gethostname()
ip_address = socket.gethostbyname(hostname)

file_dir =  r'D:\WORK\UCP\UGC'
os.chdir(file_dir)
file_extension = "bib"
# 压缩文件的名称
zip_name = f'tonghuashun.zip'

# 要压缩的文件列表
files = []

for name in os.listdir(file_dir):
    if file_extension in name:
        files.append(os.path.join(file_dir, name))


def create_zip(files, zip_name):
    ind = 0
    with zipfile.ZipFile(zip_name, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for file in files:
            ind += 1
            print(ind, file, end='\r')
            zipf.write(file, os.path.basename(file))
    print(" "*100)
    print("success! zipfile:",zip_name, "file count:", ind)
create_zip(files, zip_name)
input("type any to exit!")
In [ ]:
 
In [ ]: