In [49]:
from os import replace
import re
import json
import pprint

# Parse a text file of "#"-delimited sections into a JSON mapping and write
# it next to the source file (same name, .json extension).
path = r"""
C:\Users\Administrator\Desktop\O32.txt
""".strip()  # NOTE(review): hardcoded absolute local path — consider a config constant

res = {}
with open(path, "r", encoding="utf-8") as fr:
    def trim(ls):
        """Strip surrounding whitespace from every string in ls."""
        return [s.strip() for s in ls]

    content = fr.read()
    # Split the file on markdown-style headers ("#", "##", ... followed by whitespace).
    sections = re.split(r"#+\s", content)
    # Every non-header, non-blank line is treated as a table name.
    # Deduplicated via set — note this makes the list order non-deterministic.
    all_tb = list({line for line in content.split("\n")
                   if "#" not in line and len(line.strip()) > 0})
    all_tb = trim(all_tb)
    res["all_tbs"] = all_tb
    for sec in sections:
        # Non-empty lines of the section: first is the title, rest are tables.
        sps = [line for line in sec.split("\n") if len(line.strip()) > 0]
        sps = trim(sps)
        if len(sps) > 1:
            # The title has any " f" substring removed — presumably stripping
            # a suffix; verify against the source file's header format.
            res[sps[0].replace(" f", "")] = list(set(sps[1:]))

with open(path.replace(".txt", ".json"), "w", encoding="utf-8") as fw:
    json.dump(res, fw, ensure_ascii=False, indent=4)
The Kernel crashed while executing code in the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.
In [37]:
# Render the parsed result as a JSON string for inspection (cell Out value).
json.dumps(res, ensure_ascii=False, indent=4)
Out[37]:
'{\n    "all_tbs": [\n        "txtcs",\n        "thgjy_bdq",\n        "ttmp_h_gzb",\n        "thgjy",\n        "tsysinfo",\n        "trep_param",\n        "taccount",\n        "vjk_wbfk_gzb",\n        "tzqxx",\n        "tjjmrhjsj",\n        "tqsk",\n        "txm_xx",\n        "tjyds",\n        "tfundinfo"\n    ],\n    "创建交易流水": [\n        "tqsk",\n        "tzqxx",\n        "tjyds",\n        "tfundinfo",\n        "txtcs",\n        "thgjy",\n        "tzqxx",\n        "tjyds",\n        "tfundinfo",\n        "txtcs",\n        "thgjy_bdq",\n        "tsysinfo",\n        "txm_xx",\n        "txm_xx",\n        "tfundinfo",\n        "txtcs",\n        "tsysinfo"\n    ],\n    "估值表视图": [\n        "ttmp_h_gzb",\n        "taccount",\n        "tzqxx",\n        "tfundinfo",\n        "txtcs",\n        "trep_param",\n        "tsysinfo"\n    ],\n    "净值视图": [\n        "tjjmrhjsj"\n    ],\n    "使用估值表视图查询资产合计": [\n        "vjk_wbfk_gzb"\n    ]\n}'
In [ ]:
from selenium import webdriver
import sys
import requests
import json

def get_proxy():
    """Fetch one fresh proxy from the zhima paid proxy API.

    Returns:
        tuple[str, str]: (ip, port), port coerced to str.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError/IndexError: if the API response carries no proxy entry.
    """
    # timeout added so a hung API call cannot block the crawl loop forever
    proxy = requests.get(
        "http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=0&city=0&yys=0&port=1&pack=262694&ts=1&ys=0&cs=1&lb=1&sb=0&pb=45&mr=2&regions=",
        timeout=10,
    ).json()['data'][0]
    ip = proxy['ip']
    port = proxy['port']
    return ip, str(port)

# Load previously-seen and previously-working proxies so reruns resume where
# they left off.  Context managers close the handles (the original bare
# open() calls leaked file descriptors).
with open("all.json") as _fr:
    all_proxy = json.load(_fr) or []
with open("useful.json") as _fr:
    usefuls = json.load(_fr) or []

def test():
    """Fetch one proxy, record it, and probe it with a headless Chrome visit.

    Appends "ip:port" to all_proxy; if the target 51job page loads through
    the proxy (detected by a known company name in the page source) the
    proxy is also appended to usefuls.  Both lists are persisted to disk
    after every attempt so progress survives crashes.
    """
    ip, port = get_proxy()
    all_proxy.append(ip + ":" + port)

    op = webdriver.ChromeOptions()
    op.add_argument('--no-sandbox')
    op.add_argument('--headless')
    op.add_argument('--disable-gpu')
    # Mobile Safari UA — presumably to get the lighter mobile page; verify.
    op.add_argument("user-agent='Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'")
    op.add_argument(f"--proxy-server={ip}:{port}")

    driver = webdriver.Chrome(executable_path=r'd:/chromedriver.exe', options=op)
    try:
        driver.get('https://jobs.51job.com/all/co1992612.html')
        # Double refresh — presumably to push past interstitial pages; verify.
        driver.refresh()
        driver.refresh()
        if "原四川金点超越税务师事务所有限责任公司" in driver.page_source:
            usefuls.append(ip + ":" + port)
            print("USEFUL: ", ip + ":" + port)
        else:
            print(driver.page_source[:1000])
    except Exception as e:
        print("ERROR: ", e, ip + ":" + port)
    finally:
        driver.quit()
    # Context managers flush and close the output files (the original
    # json.dump(..., open(...)) pattern leaked write handles, risking
    # truncated/unflushed JSON on crash).
    with open("all.json", "w", encoding="utf-8") as fw:
        json.dump(all_proxy, fw)
    with open("useful.json", "w", encoding="utf-8") as fw:
        json.dump(usefuls, fw)

# Keep probing new proxies until 250 total have been recorded; test()
# persists progress to disk after every attempt, so the loop resumes
# from len(all_proxy) on re-run.
start = len(all_proxy)
for attempt in range(start, 250):
    print("第", attempt, "次")
    test()
# pool.close()
# pool.join()
In [8]:
# Quick progress check: total proxies tried vs. proxies that worked.
print(len(all_proxy), len(usefuls))
147 104
In [2]:
# Desktop-Firefox request headers used by the proxy probe below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

def test(proxy, headers):
    """Probe one "ip:port" proxy against the Ctrip flight-list page.

    Returns True (and prints the proxy) when the proxied response body is
    longer than 500 characters, i.e. the proxy returned real content;
    False otherwise.  NOTE(review): this redefines the selenium-based
    test() from the earlier cell — later definition wins.
    """
    probe_url = "https://flights.ctrip.com/online/list/oneway-nkg-bjs?_=1&depdate=2022-09-10&cabin=Y_S_C_F"
    proxy_map = {
        "http": f"http://{proxy}",
        "https": f"http://{proxy}",
    }
    # verify=False is deliberate: cheap proxies often break TLS, which is
    # why urllib3 emits InsecureRequestWarning in the output below.
    resp = requests.get(
        url=probe_url,
        proxies=proxy_map,
        verify=False,
        headers=headers,
        timeout=3,
    )
    if len(resp.text) > 500:
        print("[useful]", proxy)
        return True
    return False

# Smoke-test the probe against one known proxy endpoint.
test("65.21.141.242:10100", headers)
d:\Users\Administrator\anaconda3\envs\python3.6\lib\site-packages\urllib3\connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
d:\Users\Administrator\anaconda3\envs\python3.6\lib\site-packages\urllib3\connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
[useful] 65.21.141.242:10100
Out[2]:
True
In [3]:
from selenium import webdriver
import sys
import requests
import json
from bs4 import BeautifulSoup


# Download a community-maintained proxy list (JSON) from GitHub.
proxy_list_url = "https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.json"
resp = requests.get(proxy_list_url, headers=headers, timeout=10).json()
In [ ]:
# Probe every "proxynova" entry from the downloaded list and collect the
# working ones in `useful`.
# NOTE(review): the original cell also opened d:/https.txt but never read
# from it — that unused handle (a spurious dependency on a local file) and
# the dead commented-out selenium/BeautifulSoup variants were removed.
useful = []
for entry in resp["proxynova"]:
    proxy = entry["ip"] + ":" + entry["port"]
    try:
        if test(proxy, headers) is True:
            useful.append(proxy)
    except Exception as e:
        # Best-effort scan: a dead proxy just logs its error and is skipped.
        print(e)
In [10]:
# Display the surviving proxies (cell Out value).
useful
Out[10]:
['101.200.127.149:3129',
 '45.167.126.249:9992',
 '190.242.36.131:999',
 '158.255.215.50:16993',
 '3.68.76.43:9999',
 '35.158.228.86:17777',
 '45.248.41.216:9812',
 '133.18.239.64:8080',
 '140.227.69.124:3180',
 '183.111.25.248:8080',
 '1.255.134.136:3128',
 '171.97.116.166:8080',
 '198.59.191.234:8080',
 '3.92.63.55:19132',
 '94.139.161.162:3128']
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [11]:
import requests
import json

# Request headers for the Ctrip batchSearch API call below.
# NOTE(review): this shadows the probing `headers` defined earlier.  The
# cookie/sign/transactionid values look like captured session credentials —
# they expire and should not be hardcoded in a shared notebook.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    'accept': 'application/json',
    'content-type': 'application/json;charset=UTF-8',
    'accept-encoding': 'gzip, deflate, br',
    'cookie': '_bfa=1.1660364465030.a6edhz.1.1661478466137.1661498913719.13.59.10320605175;',
    'sign': '4b1c3e7f1ca403ced55a1a67df9e8b22',
    'transactionid': 'acd312aaf75349cc9a014f0a240a1c5a',
    'origin': 'https://flights.ctrip.com',
    'refer': 'https://flights.ctrip.com/online/list/oneway-nkg-bjs?_=1&depdate=2022-08-28&cabin=Y_S_C_F&ct=1661478472873',
}

useful = []
# Single hardcoded proxy to route the API call through.
# proxyHost,proxyPort = proxy.strip().split(":")
proxyMeta = "http://115.209.125.169:64256"
proxyMetas = "https://115.209.125.169:64256" 
proxies = {
    "http" : proxyMeta,
    "https" : proxyMetas
}
# Ctrip one-way flight search endpoint (NKG -> BJS on 2022-08-28).
uri = "https://flights.ctrip.com/international/search/api/search/batchSearch?v=0.5002154041180387"
# Raw JSON payload captured from the browser's network tab; its
# transactionID matches the `transactionid` header above.
params = """{"flightWayEnum":"OW","arrivalProvinceId":1,"extGlobalSwitches":{"useAllRecommendSwitch":true,"unfoldPriceListSwitch":true},"arrivalCountryName":"中国","infantCount":0,"cabin":"Y_S_C_F","cabinEnum":"Y_S_C_F","departCountryName":"中国","flightSegments":[{"departureDate":"2022-08-28","arrivalProvinceId":1,"arrivalCountryName":"中国","arrivalCountryCode":"CN","departureCityName":"南京","departureCityCode":"NKG","departureCountryName":"中国","departureCountryCode":"CN","arrivalCityName":"北京","arrivalCityCode":"BJS","departureCityTimeZone":480,"arrivalCountryId":1,"timeZone":480,"departureCityId":12,"departureCountryId":1,"arrivalCityTimeZone":480,"departureProvinceId":15,"arrivalCityId":1}],"childCount":0,"segmentNo":1,"scope":"d","adultCount":1,"extensionAttributes":{"LoggingSampling":false,"isFlightIntlNewUser":false},"transactionID":"acd312aaf75349cc9a014f0a240a1c5a","directFlight":false,"departureCityId":12,"isMultiplePassengerType":0,"noRecommend":false,"flightWay":"S","arrivalCityId":1,"departProvinceId":15}"""
try:
    # requests.get("https://www.baidu.com").raise_for_status
    req = requests.post(uri, headers=headers, data=params.encode("utf-8"), proxies=proxies, timeout=3)
    req.raise_for_status()
    print(json.loads(req.text))
    # useful.append(proxy)
except Exception as e:
    # Best-effort probe: any failure (timeout, TLS, HTTP error) is printed.
    print(e)
# with open(r"D:\迅雷下载\http.txt", "r", encoding="utf-8") as fd:
    
#     for proxy in fd.readlines():
        
# print(useful)
# json.loads(req.text)["data"]["flightItineraryList"][0]["flightSegments"][0]
check_hostname requires server_hostname
In [4]:
# Empty here: the batchSearch attempt above failed (check_hostname error),
# so nothing was appended to `useful`.
print(useful)
[]