In [49]:
from os import replace
import re
import json
import pprint
path = r"""
C:\Users\Administrator\Desktop\O32.txt
""".strip()
res = {}
with open(path, "r", encoding="utf-8") as fr:
def trim(ls):
return list(map(str.strip, ls))
content = fr.read()
sections = re.split("#+\\s", content)
all_tb = list(set(list(filter(lambda it: "#" not in it and len(it.strip()) > 0, content.split("\n")))))
all_tb = trim(all_tb)
res["all_tbs"] = all_tb
for sec in sections:
sps = list(filter(lambda f: len(f.strip()) > 0, sec.split("\n")))
sps = trim(sps)
if len(sps) > 1:
res[sps[0].replace(" f", "")]=list(set(sps[1:]))
with open(path.replace(".txt", ".json"), "w", encoding="utf-8") as fw:
json.dump(res, fw, ensure_ascii=False, indent=4)
The Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.
In [37]:
json.dumps(res, ensure_ascii=False, indent=4)
Out[37]:
'{\n "all_tbs": [\n "txtcs",\n "thgjy_bdq",\n "ttmp_h_gzb",\n "thgjy",\n "tsysinfo",\n "trep_param",\n "taccount",\n "vjk_wbfk_gzb",\n "tzqxx",\n "tjjmrhjsj",\n "tqsk",\n "txm_xx",\n "tjyds",\n "tfundinfo"\n ],\n "创建交易流水": [\n "tqsk",\n "tzqxx",\n "tjyds",\n "tfundinfo",\n "txtcs",\n "thgjy",\n "tzqxx",\n "tjyds",\n "tfundinfo",\n "txtcs",\n "thgjy_bdq",\n "tsysinfo",\n "txm_xx",\n "txm_xx",\n "tfundinfo",\n "txtcs",\n "tsysinfo"\n ],\n "估值表视图": [\n "ttmp_h_gzb",\n "taccount",\n "tzqxx",\n "tfundinfo",\n "txtcs",\n "trep_param",\n "tsysinfo"\n ],\n "净值视图": [\n "tjjmrhjsj"\n ],\n "使用估值表视图查询资产合计": [\n "vjk_wbfk_gzb"\n ]\n}'
In [ ]:
from selenium import webdriver
import sys
import requests
import json
def get_proxy():
proxy = requests.get("http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=0&city=0&yys=0&port=1&pack=262694&ts=1&ys=0&cs=1&lb=1&sb=0&pb=45&mr=2®ions=").json()['data'][0]
ip = proxy['ip']
port = proxy['port']
return ip, str(port)
all_proxy = json.load(open("all.json")) or []
usefuls = json.load(open("useful.json")) or []
def test():
ip, port = get_proxy()
all_proxy.append(ip + ":" + port)
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--headless')
op.add_argument('--disable-gpu')
op.add_argument("user-agent='Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'")
op.add_argument(f"--proxy-server={ip}:{port}")
driver = webdriver.Chrome(executable_path=r'd:/chromedriver.exe', options=op)
try:
driver.get('https://jobs.51job.com/all/co1992612.html')
driver.refresh()
driver.refresh()
if "原四川金点超越税务师事务所有限责任公司" in driver.page_source:
usefuls.append(ip + ":" + port)
print("USEFUL: ", ip + ":" + port)
else:
print(driver.page_source[:1000])
except Exception as e:
print("ERROR: ", e, ip + ":" + port)
finally:
driver.quit()
json.dump(all_proxy, open("all.json", "w", encoding="utf-8"))
json.dump(usefuls, open("useful.json", "w", encoding="utf-8"))
for i in range(len(all_proxy), 250):
print("第", i, "次")
test()
# pool.close()
# pool.join()
In [8]:
print(len(all_proxy), len(usefuls))
147 104
In [2]:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
def test(proxy, headers):
resp = requests.get(
url="https://flights.ctrip.com/online/list/oneway-nkg-bjs?_=1&depdate=2022-09-10&cabin=Y_S_C_F",
proxies={
"http": f"http://{proxy}",
"https": f"http://{proxy}",
},
verify=False,
headers=headers,
timeout=3
)
if (len(resp.text) > 500):
print("[useful]", proxy)
return True
return False
test("65.21.141.242:10100", headers)
d:\Users\Administrator\anaconda3\envs\python3.6\lib\site-packages\urllib3\connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning) d:\Users\Administrator\anaconda3\envs\python3.6\lib\site-packages\urllib3\connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning)
[useful] 65.21.141.242:10100
Out[2]:
True
In [3]:
from selenium import webdriver
import sys
import requests
import json
from bs4 import BeautifulSoup
resp = requests.get(
"https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.json", headers=headers, timeout=10).json()
In [ ]:
# soup = BeautifulSoup(resp)
useful = []
with open("d:/https.txt", "r", encoding="utf-8") as fdr:
# for i in soup.select("tbody tr"):
# for i in fdr.readlines():
# proxy = i.strip()
for i in resp["proxynova"]:
proxy = i["ip"] + ":" + i["port"]
# op = webdriver.ChromeOptions()
# op.add_argument('--no-sandbox')
# op.add_argument('--headless')
# op.add_argument('--disable-gpu')
# op.add_argument("user-agent='Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'")
# op.add_argument(f"--proxy-server={proxy}")
# driver = webdriver.Chrome(executable_path=r'd:/chromedriver.exe', options=op)
try:
# driver.get('https://flights.ctrip.com/online/list/oneway-nkg-bjs?_=1&depdate=2022-09-10&cabin=Y_S_C_F')
# if (len(driver.page_source) > 500):
# useful.append(proxy)
if test(proxy, headers) is True:
useful.append(proxy)
except Exception as e:
print(e)
finally:
# driver.quit()
pass
In [10]:
useful
Out[10]:
['101.200.127.149:3129', '45.167.126.249:9992', '190.242.36.131:999', '158.255.215.50:16993', '3.68.76.43:9999', '35.158.228.86:17777', '45.248.41.216:9812', '133.18.239.64:8080', '140.227.69.124:3180', '183.111.25.248:8080', '1.255.134.136:3128', '171.97.116.166:8080', '198.59.191.234:8080', '3.92.63.55:19132', '94.139.161.162:3128']
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [11]:
import requests
import json
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
'accept': 'application/json',
'content-type': 'application/json;charset=UTF-8',
'accept-encoding': 'gzip, deflate, br',
'cookie': '_bfa=1.1660364465030.a6edhz.1.1661478466137.1661498913719.13.59.10320605175;',
'sign': '4b1c3e7f1ca403ced55a1a67df9e8b22',
'transactionid': 'acd312aaf75349cc9a014f0a240a1c5a',
'origin': 'https://flights.ctrip.com',
'refer': 'https://flights.ctrip.com/online/list/oneway-nkg-bjs?_=1&depdate=2022-08-28&cabin=Y_S_C_F&ct=1661478472873',
}
useful = []
# proxyHost,proxyPort = proxy.strip().split(":")
proxyMeta = "http://115.209.125.169:64256"
proxyMetas = "https://115.209.125.169:64256"
proxies = {
"http" : proxyMeta,
"https" : proxyMetas
}
uri = "https://flights.ctrip.com/international/search/api/search/batchSearch?v=0.5002154041180387"
params = """{"flightWayEnum":"OW","arrivalProvinceId":1,"extGlobalSwitches":{"useAllRecommendSwitch":true,"unfoldPriceListSwitch":true},"arrivalCountryName":"中国","infantCount":0,"cabin":"Y_S_C_F","cabinEnum":"Y_S_C_F","departCountryName":"中国","flightSegments":[{"departureDate":"2022-08-28","arrivalProvinceId":1,"arrivalCountryName":"中国","arrivalCountryCode":"CN","departureCityName":"南京","departureCityCode":"NKG","departureCountryName":"中国","departureCountryCode":"CN","arrivalCityName":"北京","arrivalCityCode":"BJS","departureCityTimeZone":480,"arrivalCountryId":1,"timeZone":480,"departureCityId":12,"departureCountryId":1,"arrivalCityTimeZone":480,"departureProvinceId":15,"arrivalCityId":1}],"childCount":0,"segmentNo":1,"scope":"d","adultCount":1,"extensionAttributes":{"LoggingSampling":false,"isFlightIntlNewUser":false},"transactionID":"acd312aaf75349cc9a014f0a240a1c5a","directFlight":false,"departureCityId":12,"isMultiplePassengerType":0,"noRecommend":false,"flightWay":"S","arrivalCityId":1,"departProvinceId":15}"""
try:
# requests.get("https://www.baidu.com").raise_for_status
req = requests.post(uri, headers=headers, data=params.encode("utf-8"), proxies=proxies, timeout=3)
req.raise_for_status()
print(json.loads(req.text))
# useful.append(proxy)
except Exception as e:
print(e)
# with open(r"D:\迅雷下载\http.txt", "r", encoding="utf-8") as fd:
# for proxy in fd.readlines():
# print(useful)
# json.loads(req.text)["data"]["flightItineraryList"][0]["flightSegments"][0]
check_hostname requires server_hostname
In [4]:
print(useful)
[]