一、Importing the Module
import urllib.request as req
|
二、Fetching a Whole Page
import urllib.request
url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
content = response.read().decode('utf-8')
print(content)
|
三、The HTTPResponse Type and Its Methods
url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
|
Read methods
content = response.read()
print(content)
|
content = response.read(5)
print(content)
|
content = response.readline()
|
content = response.readlines()
print(content)
|
Header information
print(response.getcode())
|
print(response.getheaders())
|
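Note that an HTTPResponse is a one-shot stream: each read consumes bytes that later calls will no longer see, which is why the snippets above each assume a fresh response. A minimal sketch illustrating this (same URL as above):
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.read(5))       # the first 5 bytes
print(response.readline())    # the remainder of the first line
print(response.getcode())     # 200 if the request succeeded
print(response.getheaders())  # list of (name, value) header tuples
|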
四、Downloading Resources
4.1 Basic mode
import urllib.request
url_page = 'http://www.baidu.com'
urllib.request.urlretrieve(url_page,'baidu.html')
|
Note: this method can fetch resources from any site that has no anti-scraping measures; just match the suffix of the filename argument to the resource type (.html, .jpg, .mp4, and so on), as in the sketch below.
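For instance, a minimal sketch downloading an image (the URL is a hypothetical placeholder on a site without anti-scraping):
import urllib.request

url_img = 'http://example.com/pic.jpg'  # hypothetical image URL
urllib.request.urlretrieve(url_img, 'pic.jpg')  # .jpg suffix to match the resource
|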
4.2 Anti-anti-scraping mode
import urllib.request
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
url_picture = 'https://w.wallhaven.cc/full/l8/wallhaven-l8krdq.png'
req = urllib.request.Request(url_picture)
req.add_header('User-Agent', headers['User-Agent'])
response = urllib.request.urlopen(req)
with open('img.png', 'wb') as f:
    f.write(response.read())
|
五、Countering Anti-Scraping
5.1 Anti-scraping techniques
- User-Agent checks
- Cookie checks
5.2 Customizing the User-Agent header (countermeasure 1)
About UA: User Agent, a special string header that lets the server identify the client's operating system and version, CPU type, browser and version, and so on.
Custom headers:
headers = { 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36' }
|
Since urlopen accepts either a str or a Request object, create a Request to customize the UA:
url = 'https://www.baidu.com/'
headers = {'User-Agent': '...'}  # the UA string shown above
req = urllib.request.Request(url=url,headers=headers)
resp = urllib.request.urlopen(req)
content = resp.read().decode('utf8')
|
5.3 GET requests
Parameters must be percent-encoded (UTF-8 bytes escaped), e.g.: https://cn.bing.com/search?q=%E5%91%A8%E6%9D%B0%E4%BC%A6
import urllib.parse as parse
msg = parse.quote('周杰伦')
url = 'https://cn.bing.com/search?q=' + msg
|
import urllib.parse as parse
data = {
    'q': '周杰伦',
    'sex': '男'
}
data_en = parse.urlencode(data)
url = 'https://cn.bing.com/search?q=' + data_en
|
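Putting 5.2 and 5.3 together, a minimal end-to-end GET sketch (the query value is illustrative):
import urllib.request
import urllib.parse

data = {'q': '周杰伦'}
url = 'https://cn.bing.com/search?' + urllib.parse.urlencode(data)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')  # for GET, parameters ride in the URL; data stays unset
|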
5.4 POST requests
5.4.1 Querying the DeepL dictionary for a translation
- Set the url
- Set data: this is the POST payload; it must be urlencoded and then encoded to bytes
- Build a Request
- Call urlopen to make the request
import urllib.request
import urllib.parse

base_url = 'https://dict.deepl.com/english-chinese/search?'
param = {
    'ajax': 1,
    'source': 'english',
    'onlyDictEntries': 1,
    'translator': 'dnsof7h3k2lgh3gda',
    'kind': 'full',
    'eventkind': 'change',
    'forleftside': 'true',
    'il': 'zh'
}
data = {
    'query': 'person'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
param = urllib.parse.urlencode(param)
data = urllib.parse.urlencode(data).encode('utf-8')  # POST data must be bytes
url = base_url + param
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
with open('deep.html', 'w', encoding='utf-8') as f:
    f.write('<!doctype html><html lang="en"><head><meta charset="UTF-8"><meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"><meta http-equiv="X-UA-Compatible" content="ie=edge"><title>Document</title></head><body>')
    f.write(response.read().decode('utf8'))
    f.write('</body></html>')
print('end...')
|
5.4.2 Requesting Baidu Translate (countermeasure 2)
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
headers = {
    'Cookie': 'BIDUPSID=06131A5CB3B3012812267B71938D92FF; PSTM=1679290628; BDUSS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BDUSS_BFESS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BAIDUID=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; MCITY=-%3A; H_PS_PSSID=38516_36553_38687_38857_38795_38792_38844_38832_38920_38806_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; BA_HECTOR=8081al21a5a520248g8k8g231i9ga951o; ZFY=RLvEJ42vWfs6Nn1toRsSDPvUf6qUev:BbcKE8XtMjilw:C; PSINO=5; delPer=0; BDRCVFR[feWj1Vr5u3D]=mk3SLVN4HKm; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; ab_sr=1.0.1_ZDllODk0MGIwMDljMDY0ZGNlNWY4OWYyZTljZjBiODY0ZDcyM2U1MjYyZDIyNDFkODE3ZTJiYTIwMDI0ZGFmNjk3M2JjZDU2NzZjMjc4N2VkNmY3NzlmOTY5M2ZiZDU3NjU5NzViOTQyODQ5OGQ1ODgxZTcxZmM5ZDJjMGYxMTg2OTEzNTJhN2Y2NTgxNjI3MzM0OGJlMmExZDVkOTRmNWI5YTkxNTMyZjk5YWZjNWIwNWE4Mzk3ODk3ODlhYTgz',
}
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'spider',
    'simple_means_flag': '3',
    'sign': '63766.268839',
    'token': '05fb9f025ae9f430c5c6be7f1556f3ea',
    'domain': 'common',
    'ts': '1687699004079',
}
data = urllib.parse.urlencode(data).encode('utf8')
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
import json
obj = json.loads(content)
print(obj)
|
Baidu Translate also exposes a hidden suggestion interface: https://fanyi.baidu.com/sug?kw=eye (a sketch follows)
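A hedged sketch against this interface, assuming it accepts a POST form field named kw (the field name and response shape are assumptions based on common usage, not confirmed by this document):
import urllib.request
import urllib.parse
import json

url = 'https://fanyi.baidu.com/sug'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}
data = urllib.parse.urlencode({'kw': 'eye'}).encode('utf-8')  # 'kw' is the assumed field name
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
print(json.loads(response.read().decode('utf-8')))
|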
5.4.3 Converting POST data with a regex
In Notepad++, Find what: (.*): (.*)  Replace with: '\1':'\2',
|
Raw form data (as copied from the browser)
from: en
to: zh
query: spider
simple_means_flag: 3
sign: 63766.268839
token: 05fb9f025ae9f430c5c6be7f1556f3ea
domain: common
ts: 1687699004079
|
After the regex replacement
'from':'en',
'to':'zh',
'query':'spider',
'simple_means_flag':'3',
'sign':'63766.268839',
'token':'05fb9f025ae9f430c5c6be7f1556f3ea',
'domain':'common',
'ts':'1687699004079',
|
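The same transformation can be scripted instead of done in an editor; a minimal sketch with Python's re module (sample input shortened):
import re

raw = '''from: en
to: zh
query: spider'''
print(re.sub(r'(.*): (.*)', r"'\1':'\2',", raw))
# 'from':'en',
# 'to':'zh',
# 'query':'spider',
|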
5.5 Ajax
5.5.1 GET
import urllib.request
import urllib.parse

def create_request(page):
    base_url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    param = {
        'start': (page - 1) * 20,
        'limit': 20,
    }
    param = urllib.parse.urlencode(param)
    url = base_url + param
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def downloads(page, content):
    with open('.\\douban\\douban_' + str(page) + '.json', 'w', encoding='utf-8') as f:
        f.write(content)

if __name__ == '__main__':
    print('Scraping Douban')
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        downloads(page, content)
|
5.5.2 POST
import urllib.request
import urllib.parse

def create_request(page):
    url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10',
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url=url, data=data, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def download(page, content):
    with open('.\\kfc\\kfc_' + str(page) + '.json', 'w', encoding='utf-8') as f:
        f.write(content)

if __name__ == '__main__':
    print('Scraping KFC store locations by region')
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        download(page, content)
        print(content)
|
六、Crawler Exceptions
6.1 Importing the module (urllib.error)
6.2 HTTPError
Path errors: raised for a bad URL path on a reachable host; catch it so the error does not crash the crawler
6.3 URLError
Server or port errors: raised when a bad hostname or port makes the server unreachable; catch it so the error does not crash the crawler
6.4 Catching the exceptions
try:
    ...
except urllib.error.HTTPError:
    ...
|
6.5 Example
import urllib.request
import urllib.error
url = 'http://www.goudan111.com'
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36' }
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
# HTTPError is a subclass of URLError, so catch it first
except urllib.error.HTTPError:
    print('The system is being upgraded...')
except urllib.error.URLError:
    print('Second upgrade plan...')
|
七、Cookie Login
Scrape data by bypassing the page login
Scenario analysis:
import urllib.request
url = 'https://weibo.com/u/7074461820'
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36' }
request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
print(content)
|
Cause of the error: this user page is UTF-8 encoded, but without a login the request is redirected to the login page, whose encoding is not UTF-8
7.1 First fix attempt
Change the encoding used to decode the response
content = response.read().decode('gb2312')
|
Problem: the data retrieved turns out to be the login page's
Cause of the failed access: the request headers carry too little information for the real page to be served
7.2 The correct fix
Add the Cookie header
Cookie: SINAGLOBAL=6664999447883.047.1685427483001; ULV=1685427483097:1:1:1:6664999447883.047.1685427483001:; XSRF-TOKEN=fixtHOwhdlfU4ShqjmpCjMAz; SUB=_2A25JmmvSDeRhGeFO7FYV9i_EyTyIHXVq7toarDV8PUNbmtANLWzDkW9NQUR46RjEnGZiDiVKFwdERkwUGCyDr3pR; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5lDnHA1MVrHu_3_ljBSdRy5JpX5KzhUgL.FoM7S0BXSo2Reo52dJLoI7f2MJ8rdc4kBJ8_qJva; ALF=1719619330; SSOLoginState=1688083330; WBPSESS=VUh1dQeaNE_DJ6H7aSNQDBgox7Vik0e5-Iwx3WN-nk01w7SoOcEhzwg5oULcHibaNwgdibVmopSq389wc5bSyv-8bcP8qSHgm98oqg_e-RMoiAj25rHZBCRDNbcLkDBRGlJDSuD9Rhv9kaNSE-zdpw==
|
Code
import urllib.request
url = 'https://weibo.com/u/7074461820'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Cookie': 'SINAGLOBAL=6664999447883.047.1685427483001; ULV=1685427483097:1:1:1:6664999447883.047.1685427483001:; XSRF-TOKEN=fixtHOwhdlfU4ShqjmpCjMAz; SUB=_2A25JmmvSDeRhGeFO7FYV9i_EyTyIHXVq7toarDV8PUNbmtANLWzDkW9NQUR46RjEnGZiDiVKFwdERkwUGCyDr3pR; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5lDnHA1MVrHu_3_ljBSdRy5JpX5KzhUgL.FoM7S0BXSo2Reo52dJLoI7f2MJ8rdc4kBJ8_qJva; ALF=1719619330; SSOLoginState=1688083330; WBPSESS=VUh1dQeaNE_DJ6H7aSNQDBgox7Vik0e5-Iwx3WN-nk01w7SoOcEhzwg5oULcHibaNwgdibVmopSq389wc5bSyv-8bcP8qSHgm98oqg_e-RMoiAj25rHZBCRDNbcLkDBRGlJDSuD9Rhv9kaNSE-zdpw=='
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
|
Why it works: the cookie carries the login session; only with a post-login cookie can the crawler carry it into pages that require authentication
八、The handler Processor
urllib.request.urlopen(url) : cannot customize request headers
urllib.request.Request(url, headers, data) : can customize request headers
handler : customizes requests at a more advanced level
Usage:
- Build a handler object: handler = urllib.request.HTTPHandler()
- Build an opener object: opener = urllib.request.build_opener(handler)
- Call the opener's open method: response = opener.open(request)
Example:
import urllib.request
url = 'http://www.baidu.com'
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', }
request = urllib.request.Request(url=url, headers=headers)
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
|
九、Proxies
- Break through your own IP's access restrictions to reach otherwise inaccessible sites
- Access internal resources of certain organizations or groups
- Improve access speed
- Hide the real IP to avoid attacks
Steps
request = urllib.request.Request(url=url, headers=headers)
Set up the proxies dict
proxies = { 'http': '182.139.110.14:9000' }
|
Use the proxy handler: handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
Example
import urllib.request
url = 'http://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Cookie': 'BIDUPSID=06131A5CB3B3012812267B71938D92FF; PSTM=1679290628; BDUSS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BDUSS_BFESS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BAIDUID=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; BD_HOME=1; H_PS_PSSID=36553_38857_38795_38958_38955_38832_38920_38806_38989_26350; BD_UPN=12314753; BA_HECTOR=ah018k040l8g018h80a18g8b1i9vg631o; BAIDUID_BFESS=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; ZFY=BXOBRbhZzq3:AgQrE2UdwmoKEVxRJv4XVrrWQOLZv9lE:C; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; PSINO=5; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; baikeVisitId=a33ef4a5-bd67-4b3a-ac66-6f07ca3fee30; B64_BOT=1; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=74e2BAm6baEmykvFdTDJUKuT698Uocc37KZSVsBfg2KJ3c0eUvWu9sHBIXU',
}
proxies = {
    'http': '182.139.110.14:9000'
}
request = urllib.request.Request(url=url, headers=headers)
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('proxy/proxy01.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
|
9.1 Building a proxy pool
import urllib.request
import random

proxies_pool = [
    {'http': '182.139.110.14:9000'},
    {'http': '182.139.110.14:9001'},
    {'http': '182.139.110.14:9002'},
    {'http': '182.139.110.14:9003'},
    {'http': '182.139.110.14:9004'},
    {'http': '182.139.110.14:9005'},
]
proxies = random.choice(proxies_pool)
url = 'http://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Cookie': 'BIDUPSID=06131A5CB3B3012812267B71938D92FF; PSTM=1679290628; BDUSS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BDUSS_BFESS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BAIDUID=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; BD_HOME=1; H_PS_PSSID=36553_38857_38795_38958_38955_38832_38920_38806_38989_26350; BD_UPN=12314753; BA_HECTOR=ah018k040l8g018h80a18g8b1i9vg631o; BAIDUID_BFESS=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; ZFY=BXOBRbhZzq3:AgQrE2UdwmoKEVxRJv4XVrrWQOLZv9lE:C; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; PSINO=5; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; baikeVisitId=a33ef4a5-bd67-4b3a-ac66-6f07ca3fee30; B64_BOT=1; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=74e2BAm6baEmykvFdTDJUKuT698Uocc37KZSVsBfg2KJ3c0eUvWu9sHBIXU',
}
request = urllib.request.Request(url=url,headers=headers)
handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
|
十、Ways to Extract Parts of a Page's Source
10.1 XPath
- Install the XPath Helper.crx extension in the browser
- If the browser refuses the crx file, rename its suffix to zip and drag it in again
- Usage: press Ctrl+Shift+X on a page to bring up the helper's panel
- The extension lets you test XPath expressions against the page to verify the data you want to extract
- Install the lxml library
- Import the etree package
- XPath parsing
  - Local file: html_tree = etree.parse('xxx.html')
  - Server response: html_tree = etree.HTML(response.read().decode('utf-8'))
- Extract the data
- XPath basics (exercised in the sketch after this list)
  - Path queries
    - // : selects all descendant nodes, regardless of depth
    - / : selects direct children only
  - Predicate queries
    - //div[@id]
    - //div[@id="maincontent"]
  - Attribute queries: /@class (any other attribute works the same way)
  - Fuzzy queries
    - //div[contains(@id,"he")]
    - //div[starts-with(@id,"he")]
  - Content queries: /text()
    - text() returns the innerText of an HTML tag
    - //div/h1/text()
  - Logical operators
    - //div[@id="head" and @class="s_down"]
    - //title | //price
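A self-contained sketch exercising several of the rules above (the HTML snippet is made up for illustration):
from lxml import etree

html = '''
<div id="head" class="s_down">
  <h1>Hello</h1>
  <ul>
    <li id="l1">one</li>
    <li id="l2">two</li>
  </ul>
</div>
'''
tree = etree.HTML(html)
print(tree.xpath('//li/text()'))                             # ['one', 'two'] - all descendant li
print(tree.xpath('//li[@id="l2"]/text()'))                   # ['two'] - attribute predicate
print(tree.xpath('//li/@id'))                                # ['l1', 'l2'] - attribute query
print(tree.xpath('//div[starts-with(@id,"he")]/h1/text()'))  # ['Hello'] - fuzzy query
|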
10.1.1 Using the XPath syntax
Get the text of the div with class nbodys inside the div with class newsbody:
nbodys = tree.xpath('//div[@class="newsbody"]//div[@class="nbodys"]/text()')
|
Get the class attribute:
nbodys_class = tree.xpath('//div[@class="newsbody"]//div[@class="nbodys"]/@class')
|
10.1.2 Example
import urllib.request
from lxml import etree

url = 'https://www.bbiquge.net/book/133303/'
page = 'index_1.html'
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', }
request = urllib.request.Request(url=url+page,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('gbk')
tree = etree.HTML(content)
print(tree)
hrefs = tree.xpath('//div[@class="zjbox"]/dl[@class="zjlist"]/dd/a/@href')
urls = []
for i in range(len(hrefs)):
    urls.append(url + hrefs[i])  # url already ends with '/', so no extra separator is needed
for url in urls:
    print(url)
|
10.1.3 Larger XPath scrape, downloading to disk
import urllib.request
from lxml import etree

def create_request(page):
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/meinvxiezhen.html'
    else:
        url = 'https://sc.chinaz.com/tupian/meinvxiezhen_' + str(page) + '.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(content):
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@class="container"]//div/img/@alt')
    src_list = tree.xpath('//div[@class="container"]//div/img/@data-original')
    for i in range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        url = 'http:' + src
        urllib.request.urlretrieve(url=url, filename='img/' + name + '.jpg')

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(content)
        print('Page ' + str(page) + ' downloaded')
|
10.2 JsonPath
Install via pip (pip install jsonpath)
Usage (obtaining the JSON object)
- Load the object from a local JSON file
  - import json
  - obj = json.load(open('xxx.json', 'r', encoding='utf-8'))
- If the JSON comes from a network resource, save it locally first, then use it
obj = json.load(open('jsonpath.json', 'r', encoding='utf-8'))
|
- Using jsonpath (see the sketch below)
  - import jsonpath
  - ret = jsonpath.jsonpath(obj, 'a jsonpath expression')
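A minimal self-contained sketch of the jsonpath call (the data structure is made up for illustration):
import jsonpath

obj = {'store': {'book': [{'author': 'Alice'}, {'author': 'Bob'}]}}
print(jsonpath.jsonpath(obj, '$.store.book[*].author'))  # ['Alice', 'Bob']
print(jsonpath.jsonpath(obj, '$..author'))               # same result via recursive descent
|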
Case study: get all cities from Taopiaopiao
import urllib.request
import json
import jsonpath
url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1688282177270_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'
headers = { 'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Bx-V': '2.5.0', 'Cookie': 'cna=6zKQHGg562ACATswb40Y5qgs; t=62bc0d82720dc60b137ad4f1304cd5ee; xlly_s=1; cookie2=13f702bb8b29a0ea7a1f19121af7e5b6; v=0; _tb_token_=5e8d43fe773f; tfstk=dDSp3DYxg5VhsMe8xBUg4DXIxpwgoMBUK6WjqQAnP1COg6eer2bhybdOhJR3zy5RW_1w-BsRU0p5F_yeZJzGL9-yVSVc2oXFLWTiX3BS59nVR3N0io0iCePHVg2TeLIMBfn_gPofcgvQPLKrgYK4FpYpGOUqq0oHCjAAd4ijDGQ9JOyAmiER2jAvE2wTB4uyRdPvO88C.; l=fBO47mseN_qPmZjaBO5CFurza77OmQRb8sPzaNbMiIEGa1mctFsHZNC1c_XMSdtjgTfURexyVhmX9dEplCUd_giMW_N-1NKcFYJ6-bpU-L5..; isg=BOjoROdYGCr4czQ8JTX6KMlNudb6EUwbtnsuNKIbqWNW_YlnSCcPq1Bz9ZUNTQTz', 'Referer': 'https://dianying.taobao.com/', 'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', 'Sec-Ch-Ua-Mobile': '?0', 'Sec-Ch-Ua-Platform': '"Windows"', 'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Site': 'same-origin', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest', }
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
result = content.split('(')[1][:-2]  # strip the jsonp wrapper jsonp109(...); down to bare JSON
with open('taopiaopiao.json', 'w', encoding='utf-8') as f:  # save the network JSON locally first, as noted above
    f.write(result)
obj = json.load(open('taopiaopiao.json', 'r', encoding='utf-8'))
region_names = jsonpath.jsonpath(obj, '$..regionName')
print(region_names)
|
10.3 BeautifulSoup
Drawback: less efficient than lxml
Advantage: easy to use
Install: pip install bs4
Import: from bs4 import BeautifulSoup
Creating the object
soup = BeautifulSoup(response.read().decode(), 'lxml')  # from a server response
|
soup = BeautifulSoup(open('1.html', encoding='utf-8'), 'lxml')  # from a local file
|
10.3.1 Locating nodes
Finding nodes by tag name
- soup.a
  - finds only the first a tag; append an attribute name to get its value
  - soup.a.name
  - soup.a.attrs
Functions
.find (returns a single object)
- find('a') : finds only the first a tag
- find('a', title='name')
- find('a', class_='name')
.find_all (returns a list)
- find_all('a') : finds all a tags
- find_all(['a', 'span']) : returns all a and span tags
- find_all('a', limit=2) : finds only the first two a tags
.select (returns node objects from a CSS selector) [recommended]
element
.class
#id
Attribute selector
soup.select('li[id="l2"]')
|
Hierarchy selectors
- Descendant: element element
- Direct child: element>element
- Group (matches elements satisfying either selector): element,element
10.3.2 Node information
Getting node content
- obj.string
- obj.get_text() [recommended]
Node attributes
tag.name gets the tag name
e.g.: tag = find('li')
print(tag.name)
tag.attrs returns the attribute values as a dict
Getting node attribute values (see the combined sketch below)
- obj.attrs.get('title') [most common]
- obj.get('title')
- obj['title']
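A self-contained sketch tying the above together (the inline HTML is made up for illustration):
from bs4 import BeautifulSoup

html = '<ul><li id="l1" title="first">one</li><li id="l2">two</li></ul>'
soup = BeautifulSoup(html, 'lxml')
print(soup.li)                              # only the first li
print(soup.find('li', id='l2').get_text())  # two
print(soup.find_all('li', limit=2))         # list of both li tags
print(soup.select('li[id="l2"]'))           # attribute selector
tag = soup.find('li')
print(tag.name, tag.attrs)                  # li {'id': 'l1', 'title': 'first'}
print(tag.attrs.get('title'))               # first
|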
十一、Summary of Anti-Scraping Techniques
11.1 User-Agent checks
Verify browser information
11.2 Cookie checks
Verify login information
11.3 Referer checks
Check whether the request was referred from a given site; typically used for image hotlink protection (see the sketch below)
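The usual countermeasure is to send a Referer header pointing at the page that legitimately embeds the resource; a hedged sketch (the URL and Referer value are hypothetical):
import urllib.request

url = 'https://example.com/images/pic.jpg'  # hypothetical hotlink-protected image
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Referer': 'https://example.com/',      # claim we navigated from the hosting site
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
with open('pic.jpg', 'wb') as f:
    f.write(response.read())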