spider
网络爬虫2
YeeKal
•
•
"#spider"
异步 代理池
cookie
- browser_cookie3: Loads cookies used by your web browser into a cookiejar object
>>> import browser_cookie3
>>> import requests
>>> cj = browser_cookie3.chrome()
>>> r = requests.get(url, cookies=cj)
>>> get_title(r.content)
'richardpenman / home — Bitbucket'
headers
# pip3 install fake_headers
>>> from fake_headers import Headers
>>> headers = Headers(headers=True).generate()
>>> headers
{'Accept': '*/*', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64; rv:52.7.1) Gecko/20100101 Firefox/52.7.1', 'Accept-Encoding': 'gzip, deflate, br', 'Upgrade-Insecure-Requests': '1'}
>>>
Proxy
proxies={
"http":"<ip>:<port>",
"https":"<ip>:<port>"
}
requests.get(url, proxies = proxies, verify=False)
免费代理池
- github opensource project
- web
云函数代理池
# http
./scfproxy deploy http -p tencent -r ap-*
./scfproxy http -l :8066 -c ~/.config/scfproxy/cert/scfproxy.cer -k ~/.config/scfproxy/cert/scfproxy.key
# socks
./scfproxy socks -l 8066 -s 8067 -h 118.89.135.45
curl -x socks5://ip:port myip.ipip.net
-
deploy on tencent cloud
- install mitmproxy
- 安装证书 https://blog.csdn.net/u013091013/article/details/101430260
openssl x509 -in mitmproxy-ca-cert.pem -inform PEM -out mitmproxy-ca-cert.crt
- install cer certificate
sudo cp mycert.cer /usr/share/ca-certificates/mycert.pem
$ sudo dpkg-reconfigure ca-certificates
$ sudo update-ca-certificates
异步 / 协程
aiohttp
同步&异步
cf = partial(singleSkuCheckAsyncProxy, session=session,sku_id=sku, product_id=product)
ff = loop.run_in_executor(None, cf)
requests
# http res
res.content # binary
res.text # string
res.json() # json result