1. Importing the module

import urllib.request as req

2. Fetching a whole page

# Use urllib to fetch the source of the Baidu homepage
import urllib.request

# Target URL
url = 'http://www.baidu.com'

# Send the request to the server; response holds the server's reply
response = urllib.request.urlopen(url)

# Read the page source from the response
content = response.read().decode('utf-8')

# Print the data
print(content)

3. The HTTPResponse type and its methods

url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
print(type(response))
# <class 'http.client.HTTPResponse'>

Read methods

  • read()
# read the entire response body at once (returns bytes)
content = response.read()
print(content)
  • read(n)
# read five bytes
content = response.read(5)
# b'<!DOC'
print(content)
  • readline()
# read a single line
content = response.readline()
  • readlines()
# read the response line by line into a list
content = response.readlines()
print(content)

Response metadata

  • getcode()
# return the status code
# 200
print(response.getcode())
  • geturl()
# return the requested URL
# http://www.baidu.com
print(response.geturl())
  • getheaders()
# return the response headers
# [('Connection', 'close'), ('Transfer-Encoding', 'chunked'), ('Bdpagetype', '1'), ('Bdqid', '0xdee6206e000947e5'), ('Content-Security-Policy', "frame-ancestors 'self' https://chat.baidu.com http://mirror-chat.baidu.com https://fj-chat.baidu.com https://hba-chat.baidu.com https://hbe-chat.baidu.com https://njjs-chat.baidu.com https://nj-chat.baidu.com https://hna-chat.baidu.com https://hnb-chat.baidu.com http://debug.baidu-int.com;"), ('Content-Type', 'text/html; charset=utf-8'), ('Date', 'Sun, 25 Jun 2023 07:43:20 GMT'), ('P3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('P3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('Server', 'BWS/1.1'), ('Set-Cookie', 'BAIDUID=8F8527EF736D370F80FA1FE3708A6D99:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BIDUPSID=8F8527EF736D370F80FA1FE3708A6D99; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'PSTM=1687679000; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BAIDUID=8F8527EF736D370FAA8B1A78468BD03D:FG=1; max-age=31536000; expires=Mon, 24-Jun-24 07:43:20 GMT; domain=.baidu.com; path=/; version=1; comment=bd'), ('Set-Cookie', 'BDSVRTM=0; path=/'), ('Set-Cookie', 'BD_HOME=1; path=/'), ('Set-Cookie', 'H_PS_PSSID=38516_36560_38687_38858_38796_38903_38842_38577_38813_38639_26350; path=/; domain=.baidu.com'), ('Traceid', '1687679000282143540216061560777882552293'), ('Vary', 'Accept-Encoding'), ('X-Ua-Compatible', 'IE=Edge,chrome=1')]
print(response.getheaders())

4. Downloading resources

4.1 Basic mode

import urllib.request

# Download a web page
url_page = 'http://www.baidu.com'
# url is the download path, filename is the local file name
# 1. Call urlretrieve(url, filename); this works for sites without anti-crawling measures
urllib.request.urlretrieve(url_page, 'baidu.html')

Note: this method can fetch any resource from a site without anti-crawling measures; just change the filename suffix to match the resource type.
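A minimal sketch of grabbing a non-HTML resource the same way; the image URL below is a placeholder, assuming the target site accepts plain urllib requests:

import urllib.request

# hypothetical image URL; substitute a real resource address
url_img = 'http://example.com/sample.png'

# the local filename's suffix simply matches the resource type
urllib.request.urlretrieve(url_img, 'sample.png')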

4.2 Working around anti-crawling checks

import urllib.request

# Download an image

# Counter the site's header-based anti-crawling check:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'}

# Image resource address
url_picture = 'https://w.wallhaven.cc/full/l8/wallhaven-l8krdq.png'

# Build a Request for the image URL and attach the User-Agent header
req = urllib.request.Request(url_picture)
req.add_header('User-Agent', headers['User-Agent'])

# Request the resource and get the response
response = urllib.request.urlopen(req)

# Write the data to img.png
# 'with' closes the file automatically when the block ends
# 'as f' binds the return value of open()
with open('img.png', 'wb') as f:
    f.write(response.read())

5. Countering anti-crawling measures

5.1 Anti-crawling techniques

  1. User-Agent checks
  2. Cookie checks

5.2 Customizing the User-Agent header (counter #1)

About UA: the User-Agent is a special header string that lets the server identify the client's operating system and version, CPU type, browser and version, and so on.

Customize the headers:

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

Since urlopen() accepts either a URL string or a Request object, create a Request to customize the UA.

url = 'https://www.baidu.com/'
headers = {'User-Agent': '...'}  # use the UA string shown above
# Request's positional order is (url, data, headers), so pass headers as a keyword argument
req = urllib.request.Request(url=url, headers=headers)
# the header can also be set separately
# req = urllib.request.Request(url)
# req.add_header('User-Agent', headers['User-Agent'])
resp = urllib.request.urlopen(req)
content = resp.read().decode('utf8')

5.3 GET requests

Query parameters must be percent-encoded (URL-encoded): https://cn.bing.com/search?q=%E5%91%A8%E6%9D%B0%E4%BC%A6

  • quote() (single parameter)
import urllib.parse as parse

msg = parse.quote('周杰伦')  # percent-encode the value
url = 'https://cn.bing.com/search?q=' + msg

# https://cn.bing.com/search?q=%E5%91%A8%E6%9D%B0%E4%BC%A6
  • urlencode() (multiple parameters)
# handles the multi-parameter case

import urllib.parse as parse

data = {
    'q': '周杰伦',
    'sex': '男'
}
data_en = parse.urlencode(data)
url = 'https://cn.bing.com/search?' + data_en

# https://cn.bing.com/search?q=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7
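Putting the pieces together, a minimal GET request sketch; the Bing URL and the User-Agent string are simply the ones used above:

import urllib.request
import urllib.parse

# build the query string from a dict and append it to the base URL
params = urllib.parse.urlencode({'q': '周杰伦'})
url = 'https://cn.bing.com/search?' + params

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content[:200])  # print the first part of the page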

5.4 POST requests

5.4.1 Requesting a translation from DeepL

  1. Set the URL
  2. Set data; these are the POST parameters and must be urlencoded and then encoded to bytes
  3. Build a Request object
  4. Send it with urlopen()
import urllib.request
import urllib.parse

# 1.
base_url = 'https://dict.deepl.com/english-chinese/search?'
param = {
    'ajax': 1,
    'source': 'english',
    'onlyDictEntries': 1,
    'translator': 'dnsof7h3k2lgh3gda',
    'kind': 'full',
    'eventkind': 'change',
    'forleftside': 'true',
    'il': 'zh'
}

# 2.
data = {
    'query': 'person'
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

# 3.
param = urllib.parse.urlencode(param)
# POST data must be encoded to bytes, otherwise urlopen raises an error
data = urllib.parse.urlencode(data).encode('utf-8')
url = base_url + param

# 4. submit the POST request
request = urllib.request.Request(url=url, data=data, headers=headers)
# print(request)
response = urllib.request.urlopen(request)

with open('deep.html', 'w', encoding='utf-8') as f:
    f.write('<!doctype html><html lang="en"><head><meta charset="UTF-8"><meta name="viewport"content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"><meta http-equiv="X-UA-Compatible" content="ie=edge"><title>Document</title></head><body>')
    f.write(response.read().decode('utf8'))
    f.write('</body></html>')
print('end...')

5.4.2 Requesting Baidu Translate (counter #2)

import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
headers = {
# 'Accept': '*/*',
# comment out Accept-Encoding below when writing a crawler, otherwise the response comes back compressed
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Acs-Token': '1687699004668_1687699004103_kLE706Zo9A39/QzZ9HtVYm/vdl0Zj1TfusbR0BJ5sTasxvycLvABkoc+1b6TbiysG8nnXLixN16AH5z42O8H7U4R3Z388JLQ5mGcMqNr8MnJR13mnuLIgltWgB4XcG57Cp/jQXUo4KA1iF5pha+EYgtHAYoTrUKQ4EiCeBzj66Ibabwxe1YHKprcKQm7GahCMQ/iWEpgSmI1XZPlUGRzHN/eNId4tARiNvADUSmAOA5Re8i6v5ptXWuXPhyDh5d0ij465bTSNd5FJLZkQ699l+ZfFz0tZiyAp6SzkWwKtGOJkUVEdXFXWB/L/F3B1z+4BNliD3v1Uy65Ec/+sMj6R+x95Qsdn/3MXGsxkROFmdXr/tQQIgTFv3TVB48BybZNCsAH2RZUCUFszHU+xANONueA0S5WxlvQoGn80I4+msg3autEzT26nZ3lMmi2UTgP3LYA85ArluKINkmBXd7uJlRpTjfFKSQMKT3qPuA1r5GxEcCfd+n5JSQafy5bxr2H',
# 'Connection': 'keep-alive',
# 'Content-Length': '134',
# 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'BIDUPSID=06131A5CB3B3012812267B71938D92FF; PSTM=1679290628; BDUSS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BDUSS_BFESS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BAIDUID=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; MCITY=-%3A; H_PS_PSSID=38516_36553_38687_38857_38795_38792_38844_38832_38920_38806_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; BA_HECTOR=8081al21a5a520248g8k8g231i9ga951o; ZFY=RLvEJ42vWfs6Nn1toRsSDPvUf6qUev:BbcKE8XtMjilw:C; PSINO=5; delPer=0; BDRCVFR[feWj1Vr5u3D]=mk3SLVN4HKm; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; ab_sr=1.0.1_ZDllODk0MGIwMDljMDY0ZGNlNWY4OWYyZTljZjBiODY0ZDcyM2U1MjYyZDIyNDFkODE3ZTJiYTIwMDI0ZGFmNjk3M2JjZDU2NzZjMjc4N2VkNmY3NzlmOTY5M2ZiZDU3NjU5NzViOTQyODQ5OGQ1ODgxZTcxZmM5ZDJjMGYxMTg2OTEzNTJhN2Y2NTgxNjI3MzM0OGJlMmExZDVkOTRmNWI5YTkxNTMyZjk5YWZjNWIwNWE4Mzk3ODk3ODlhYTgz',
# 'Host': 'fanyi.baidu.com',
# 'Origin': 'https://fanyi.baidu.com',
# 'Referer': 'https://fanyi.baidu.com/',
# 'Sec-Fetch-Dest': 'empty',
# 'Sec-Fetch-Mode': 'cors',
# 'Sec-Fetch-Site': 'same-origin',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
# 'X-Requested-With': 'XMLHttpRequest',
# 'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"',
}
data = {
'from': 'en',
'to': 'zh',
'query': 'spider',
'simple_means_flag': '3',
'sign': '63766.268839',
'token': '05fb9f025ae9f430c5c6be7f1556f3ea',
'domain': 'common',
'ts': '1687699004079',
}
# encode the POST parameters to bytes
data = urllib.parse.urlencode(data).encode('utf8')

# build the Request object
request = urllib.request.Request(url=url, data=data, headers=headers)

# send the request, simulating a browser
response = urllib.request.urlopen(request)

# read the response data
content = response.read().decode('utf-8')

import json

obj = json.loads(content)
print(obj)

Baidu Translate's hidden suggestion endpoint: https://fanyi.baidu.com/sug?kw=eye

5.4.3 Converting POST data with a regex replace

In Notepad++:
Find what: (.*): (.*)
Replace with: '\1':'\2',

Raw form data (copied from the browser's developer tools)

from: en
to: zh
query: spider
simple_means_flag: 3
sign: 63766.268839
token: 05fb9f025ae9f430c5c6be7f1556f3ea
domain: common
ts: 1687699004079

After the regex replace

'from':'en',
'to':'zh',
'query':'spider',
'simple_means_flag':'3',
'sign':'63766.268839',
'token':'05fb9f025ae9f430c5c6be7f1556f3ea',
'domain':'common',
'ts':'1687699004079',
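The same transformation can be scripted; a minimal sketch with Python's re module, applied to a trimmed excerpt of the raw form data above:

import re

# trimmed excerpt of the raw form data
raw = '''from: en
to: zh
query: spider'''

# same pattern as the Notepad++ replace: "key: value" -> "'key':'value',"
converted = re.sub(r'(.*): (.*)', r"'\1':'\2',", raw)
print(converted)
# 'from':'en',
# 'to':'zh',
# 'query':'spider',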

5.5 AJAX

5.5.1 GET

import urllib.request
import urllib.parse

def create_request(page):
    # base URL
    base_url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&'

    # request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }

    # URL parameters
    param = {
        'start': (page - 1) * 20,
        'limit': 20,
    }
    # encode and join the parameters
    param = urllib.parse.urlencode(param)

    # assemble the full URL
    url = base_url + param

    # build the Request object for the GET request
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    # send the request and get the response
    response = urllib.request.urlopen(request)
    # read and decode the response data
    content = response.read().decode('utf-8')
    return content


def downloads(page, content):
    # write the scraped data to disk
    with open('.\\douban\\douban_' + str(page) + '.json', 'w', encoding='utf-8') as f:
        f.write(content)


if __name__ == '__main__':
    print('Scraping Douban')
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        downloads(page, content)

5.5.2 POST

import urllib.request
import urllib.parse


def create_request(page):
    # base URL
    url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'

    # request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }

    # POST parameters
    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10',
    }
    # encode the POST parameters to bytes
    data = urllib.parse.urlencode(data).encode('utf-8')

    # build and return the Request object
    request = urllib.request.Request(url=url, data=data, headers=headers)
    return request


def get_content(request):
    # send the request
    response = urllib.request.urlopen(request)
    # read and decode the response data
    content = response.read().decode('utf-8')
    return content


def download(page, content):
    # save the data into the kfc directory
    with open('.\\kfc\\kfc_' + str(page) + '.json', 'w', encoding='utf-8') as f:
        f.write(content)


if __name__ == '__main__':
    print('Scraping KFC store listings by region')
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # build the Request object
        request = create_request(page)
        # get the response data
        content = get_content(request)
        # save the data locally
        download(page, content)
        print(content)

6. Crawler exceptions

6.1 Importing the module

import urllib.error

6.2 HTTPError

Raised for a bad path: the URL points to a missing resource (e.g., a 404); catch the exception to handle it.

6.3 URLError

Raised for a bad host or port: the URL is unreachable; catch the exception to handle it.

6.4 Catching exceptions

try:
    ...
except urllib.error.HTTPError:
    ...

6.5 Example

# _*_ coding : utf-8 _*_
# @Time : 2023/6/30 7:43
# @Author : bamboo
# @File : urllib_except_csdn
# @Project : py-pro

import urllib.request
import urllib.error

# correct URL
# url = 'https://blog.csdn.net/csdnnews/article/details/131427781'
# bad path (wrong URL) -> HTTPError
# url = 'https://blog.csdn.net/csdnnews/article/details/1314277811'
# bad host or port (wrong URL) -> URLError
url = 'http://www.goudan111.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

try:
    request = urllib.request.Request(url=url, headers=headers)

    response = urllib.request.urlopen(request)

    content = response.read().decode('utf-8')

    print(content)
# bad path
except urllib.error.HTTPError:
    # an error actually occurred; mask the details
    print('The system is being upgraded...')

# bad host or address
except urllib.error.URLError:
    print('Upgrade plan B...')

7. Logging in with cookies

Bypass the login page and scrape data that requires authentication.

Scenario analysis:

# _*_ coding : utf-8 _*_
# @Time : 2023/6/30 8:07
# @Author : bamboo
# @File : urllib_cookie_login_weibo
# @Project : py-pro

import urllib.request

url = 'https://weibo.com/u/7074461820'

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url,headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf8')

print(content)
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xca in position 339: invalid continuation byte

Cause of the error: the profile page itself is UTF-8 encoded, but without login the request is redirected to the login page, whose encoding is not UTF-8.

7.1 First attempted fix

Change the charset used to decode the response:

content = response.read().decode('gb2312')

Problem: the data retrieved turns out to be the login page, not the profile page.

Why the request fails: the request headers are insufficient, so the site refuses to serve the target page.

7.2 The correct fix

Add the Cookie header:

Cookie:
SINAGLOBAL=6664999447883.047.1685427483001; ULV=1685427483097:1:1:1:6664999447883.047.1685427483001:; XSRF-TOKEN=fixtHOwhdlfU4ShqjmpCjMAz; SUB=_2A25JmmvSDeRhGeFO7FYV9i_EyTyIHXVq7toarDV8PUNbmtANLWzDkW9NQUR46RjEnGZiDiVKFwdERkwUGCyDr3pR; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5lDnHA1MVrHu_3_ljBSdRy5JpX5KzhUgL.FoM7S0BXSo2Reo52dJLoI7f2MJ8rdc4kBJ8_qJva; ALF=1719619330; SSOLoginState=1688083330; WBPSESS=VUh1dQeaNE_DJ6H7aSNQDBgox7Vik0e5-Iwx3WN-nk01w7SoOcEhzwg5oULcHibaNwgdibVmopSq389wc5bSyv-8bcP8qSHgm98oqg_e-RMoiAj25rHZBCRDNbcLkDBRGlJDSuD9Rhv9kaNSE-zdpw==

Code

# _*_ coding : utf-8 _*_
# @Time : 2023/6/30 8:07
# @Author : bamboo
# @File : urllib_cookie_login_weibo
# @Project : py-pro

import urllib.request

url = 'https://weibo.com/u/7074461820'

headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Cookie':
'SINAGLOBAL=6664999447883.047.1685427483001; ULV=1685427483097:1:1:1:6664999447883.047.1685427483001:; XSRF-TOKEN=fixtHOwhdlfU4ShqjmpCjMAz; SUB=_2A25JmmvSDeRhGeFO7FYV9i_EyTyIHXVq7toarDV8PUNbmtANLWzDkW9NQUR46RjEnGZiDiVKFwdERkwUGCyDr3pR; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5lDnHA1MVrHu_3_ljBSdRy5JpX5KzhUgL.FoM7S0BXSo2Reo52dJLoI7f2MJ8rdc4kBJ8_qJva; ALF=1719619330; SSOLoginState=1688083330; WBPSESS=VUh1dQeaNE_DJ6H7aSNQDBgox7Vik0e5-Iwx3WN-nk01w7SoOcEhzwg5oULcHibaNwgdibVmopSq389wc5bSyv-8bcP8qSHgm98oqg_e-RMoiAj25rHZBCRDNbcLkDBRGlJDSuD9Rhv9kaNSE-zdpw=='
}

request = urllib.request.Request(url=url, headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')

print(content)

Why it works: the cookie carries the login session; once the post-login cookie is sent with the request, any page can be reached.

8. Handlers

urllib.request.urlopen(url): cannot customize request headers

urllib.request.Request(url, data, headers): can customize request headers

handler: supports more advanced customization, e.g.

  • dynamic cookies
  • proxies

Usage

  1. Create a handler object: handler = urllib.request.HTTPHandler()
  2. Build an opener object: opener = urllib.request.build_opener(handler)
  3. Call the opener's open() method: response = opener.open(request)

Example:

# _*_ coding : utf-8 _*_
# @Time : 2023/7/1 12:59
# @Author : bamboo
# @File : urllib_handler
# @Project : py-pro

import urllib.request

url = 'http://www.baidu.com'

headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}

request = urllib.request.Request(url=url, headers=headers)

# create the handler
# 1. get a handler object
handler = urllib.request.HTTPHandler()
# 2. build an opener object
opener = urllib.request.build_opener(handler)
# 3. call the opener's open() method
response = opener.open(request)

content = response.read().decode('utf-8')

print(content)

9. Proxies

  1. Get around your own IP's access restrictions, or reach otherwise blocked sites
  2. Access internal resources of an organization or group
  3. Improve access speed (e.g., through a caching proxy)
  4. Hide your real IP to avoid being targeted

Steps

  1. request = urllib.request.Request(url=url, headers=headers)

  2. Build the proxies dict

    proxies = {
        'http': '182.139.110.14:9000'
    }
  3. Use the proxy handler: handler = urllib.request.ProxyHandler(proxies=proxies)

  4. opener = urllib.request.build_opener(handler)

  5. response = opener.open(request)

Example

# _*_ coding : utf-8 _*_
# @Time : 2023/7/1 14:00
# @Author : bamboo
# @File : urllib_proxy
# @Project : py-pro

import urllib.request

url = 'http://www.baidu.com/s?wd=ip'

headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Cookie':
'BIDUPSID=06131A5CB3B3012812267B71938D92FF; PSTM=1679290628; BDUSS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BDUSS_BFESS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BAIDUID=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; BD_HOME=1; H_PS_PSSID=36553_38857_38795_38958_38955_38832_38920_38806_38989_26350; BD_UPN=12314753; BA_HECTOR=ah018k040l8g018h80a18g8b1i9vg631o; BAIDUID_BFESS=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; ZFY=BXOBRbhZzq3:AgQrE2UdwmoKEVxRJv4XVrrWQOLZv9lE:C; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; PSINO=5; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; baikeVisitId=a33ef4a5-bd67-4b3a-ac66-6f07ca3fee30; B64_BOT=1; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=74e2BAm6baEmykvFdTDJUKuT698Uocc37KZSVsBfg2KJ3c0eUvWu9sHBIXU',
# 'Referer':
# 'http://www.baidu.com/s?wd=ip'
}

# proxy IP
proxies = {
'http': '182.139.110.14:9000'
}
request = urllib.request.Request(url=url, headers=headers)

handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)


content = response.read().decode('utf-8')

with open('proxy/proxy01.html', 'w', encoding='utf-8') as fp:
    fp.write(content)

9.1 Building a proxy pool

import urllib.request
import random

# build the proxy pool
proxies_pool = [
    {'http': '182.139.110.14:9000'},
    {'http': '182.139.110.14:9001'},
    {'http': '182.139.110.14:9002'},
    {'http': '182.139.110.14:9003'},
    {'http': '182.139.110.14:9004'},
    {'http': '182.139.110.14:9005'},
]
# pick a proxy at random
proxies = random.choice(proxies_pool)
# print(proxies)

url = 'http://www.baidu.com/s?wd=ip'

headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Cookie':
'BIDUPSID=06131A5CB3B3012812267B71938D92FF; PSTM=1679290628; BDUSS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BDUSS_BFESS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BAIDUID=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; BD_HOME=1; H_PS_PSSID=36553_38857_38795_38958_38955_38832_38920_38806_38989_26350; BD_UPN=12314753; BA_HECTOR=ah018k040l8g018h80a18g8b1i9vg631o; BAIDUID_BFESS=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; ZFY=BXOBRbhZzq3:AgQrE2UdwmoKEVxRJv4XVrrWQOLZv9lE:C; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; PSINO=5; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; baikeVisitId=a33ef4a5-bd67-4b3a-ac66-6f07ca3fee30; B64_BOT=1; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=74e2BAm6baEmykvFdTDJUKuT698Uocc37KZSVsBfg2KJ3c0eUvWu9sHBIXU',
}

request = urllib.request.Request(url=url,headers=headers)

handler = urllib.request.ProxyHandler(proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')

10. Extracting parts of a page's source

10.1 XPath

  1. Install the XPath Helper.crx extension in the browser
    • if the browser rejects the crx file, rename the suffix to zip and drag it in again
    • usage: press ctrl+shift+x on a page to open the helper panel
    • the extension lets you test XPath expressions directly on the page to verify they select the data you want
  2. Install the lxml library
  3. Import etree
    • from lxml import etree
  4. Parse with XPath
    • local file: html_tree = etree.parse('xxx.html', etree.HTMLParser())
    • server response data: html_tree = etree.HTML(response.read().decode('utf-8'))
  5. Extract data
    • html_tree.xpath(xpath_expression)
  6. Basic XPath syntax (a runnable sketch follows this list)
    1. Path queries
      • // : select all descendant nodes, regardless of depth
      • / : select direct children
    2. Predicate queries
      • //div[@id]
      • //div[@id="maincontent"]
    3. Attribute queries (/@class, or any other attribute)
      • //@class
    4. Fuzzy queries
      • //div[contains(@id,"he")]
      • //div[starts-with(@id,"he")]
    5. Text content (/text())
      • text() returns the innerText of an HTML tag
      • //div/h1/text()
    6. Logical operators
      • //div[@id="head" and @class="s_down"]
      • //title | //price
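A minimal lxml sketch exercising the syntax above on a small, made-up HTML snippet (the tag names, ids and classes are purely illustrative):

from lxml import etree

# made-up HTML used only to demonstrate the expressions
html = '''
<div id="head" class="s_down">
  <h1>Hello</h1>
  <ul>
    <li id="l1" class="a1">apple</li>
    <li id="l2">banana</li>
  </ul>
</div>
'''
tree = etree.HTML(html)

print(tree.xpath('//li/text()'))                     # descendant query -> ['apple', 'banana']
print(tree.xpath('//li[@id="l1"]/text()'))           # predicate query -> ['apple']
print(tree.xpath('//li[contains(@id,"l")]/@class'))  # fuzzy + attribute query -> ['a1']
print(tree.xpath('//div[@id="head" and @class="s_down"]/h1/text()'))  # logical query -> ['Hello']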

10.1.1 Using XPath expressions

  1. Get the text of the elements with class "nbodys" inside the element with class "newsbody"

    nbodys = tree.xpath('//div[@class="newsbody"]//div[@class="nbodys"]/text()')
  2. Get the class attribute

    nbodys_class = tree.xpath('//div[@class="newsbody"]//div[@class="nbodys"]/@class')

10.1.2 Example

# _*_ coding : utf-8 _*_
# @Time : 2023/7/1 16:32
# @Author : bamboo
# @File : use
# @Project : py-pro

import urllib.request
from lxml import etree

url = 'https://www.bbiquge.net/book/133303/'
page = 'index_1.html'

headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
# 'Cookie':
# 'BIDUPSID=06131A5CB3B3012812267B71938D92FF; PSTM=1679290628; BDUSS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BDUSS_BFESS=XFtamFsanNsZWJWTFdoMUhTSXNKWXJxMnNGeVVqblIxTnVUd05zNlc3S0NrbnBrRVFBQUFBJCQAAAAAAAAAAAEAAADiwVCv0uCzvl~csgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIIFU2SCBVNkM; BAIDUID=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; BD_HOME=1; H_PS_PSSID=36553_38857_38795_38958_38955_38832_38920_38806_38989_26350; BD_UPN=12314753; BA_HECTOR=ah018k040l8g018h80a18g8b1i9vg631o; BAIDUID_BFESS=4E8EC46552D9E18C72A862024180E013:SL=0:NR=10:FG=1; ZFY=BXOBRbhZzq3:AgQrE2UdwmoKEVxRJv4XVrrWQOLZv9lE:C; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; PSINO=5; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; baikeVisitId=a33ef4a5-bd67-4b3a-ac66-6f07ca3fee30; B64_BOT=1; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=74e2BAm6baEmykvFdTDJUKuT698Uocc37KZSVsBfg2KJ3c0eUvWu9sHBIXU',
}

request = urllib.request.Request(url=url+page,headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('gbk')

tree = etree.HTML(content)
print(tree)
hrefs = tree.xpath('//div[@class="zjbox"]/dl[@class="zjlist"]/dd/a/@href')

urls = []

for i in range(len(hrefs)):
    urls.append(url + hrefs[i])  # url already ends with '/'

for url in urls:
    print(url)

10.1.3 A larger XPath example: scrape images and save them locally

# _*_ coding : utf-8 _*_
# @Time : 2023/7/1 21:06
# @Author : bamboo
# @File : urllib
# @Project : py-pro
import urllib.request
from lxml import etree

def create_request(page):
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/meinvxiezhen.html'
    else:
        url = 'https://sc.chinaz.com/tupian/meinvxiezhen_' + str(page) + '.html'
    # request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    # build the Request object
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(content):
    # download the images
    # urllib.request.urlretrieve('image url', 'filename')
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@class="container"]//div/img/@alt')
    src_list = tree.xpath('//div[@class="container"]//div/img/@data-original')

    for i in range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        url = 'http:' + src
        urllib.request.urlretrieve(url=url, filename='img/' + name + '.jpg')

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        # build the Request object
        request = create_request(page)
        # get the page source
        content = get_content(request)
        # download
        down_load(content)
        print('Page ' + str(page) + ' downloaded')

10.2 JsonPath

  1. Install with pip

    • pip install jsonpath
  2. Usage (getting a JSON object)

    1. Load the object from a local JSON file
    • obj = json.load(open('file.json', 'r', encoding='utf-8'))
    • import json
    • if the JSON comes from the network, save it locally first, then load it:
    obj = json.load(open('jsonpath.json', 'r', encoding='utf-8'))
    2. Apply a JsonPath expression (see the sketch below)
    • ret = jsonpath.jsonpath(obj, 'jsonpath expression')
    • import jsonpath
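A minimal local sketch, assuming the jsonpath package is installed; the data dict stands in for a loaded JSON file and is made up for illustration:

import jsonpath

# made-up data standing in for a loaded JSON file
obj = {
    'store': {
        'book': [
            {'title': 'A', 'price': 10},
            {'title': 'B', 'price': 20},
        ]
    }
}

# $..title selects every "title" field anywhere in the document
print(jsonpath.jsonpath(obj, '$..title'))               # ['A', 'B']

# explicit path to all book prices
print(jsonpath.jsonpath(obj, '$.store.book[*].price'))  # [10, 20]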

Example: fetch the list of all cities from taopiaopiao (Taobao Movies)

# _*_ coding : utf-8 _*_
# @Time : 2023/7/2 15:18
# @Author : bamboo
# @File : urllib_jsonpath_taopiaopiao
# @Project : py-pro

import urllib.request
import json
import jsonpath

url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1688282177270_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'

headers = {
'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
# 'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Bx-V': '2.5.0',
'Cookie': 'cna=6zKQHGg562ACATswb40Y5qgs; t=62bc0d82720dc60b137ad4f1304cd5ee; xlly_s=1; cookie2=13f702bb8b29a0ea7a1f19121af7e5b6; v=0; _tb_token_=5e8d43fe773f; tfstk=dDSp3DYxg5VhsMe8xBUg4DXIxpwgoMBUK6WjqQAnP1COg6eer2bhybdOhJR3zy5RW_1w-BsRU0p5F_yeZJzGL9-yVSVc2oXFLWTiX3BS59nVR3N0io0iCePHVg2TeLIMBfn_gPofcgvQPLKrgYK4FpYpGOUqq0oHCjAAd4ijDGQ9JOyAmiER2jAvE2wTB4uyRdPvO88C.; l=fBO47mseN_qPmZjaBO5CFurza77OmQRb8sPzaNbMiIEGa1mctFsHZNC1c_XMSdtjgTfURexyVhmX9dEplCUd_giMW_N-1NKcFYJ6-bpU-L5..; isg=BOjoROdYGCr4czQ8JTX6KMlNudb6EUwbtnsuNKIbqWNW_YlnSCcPq1Bz9ZUNTQTz',
'Referer': 'https://dianying.taobao.com/',
'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}

request = urllib.request.Request(url=url, headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')
result = content.split('(')[1][:-2]
# print(result)

# save the result to a JSON file
# with open('taopiaopiao.json','w',encoding='utf-8') as fp:
# fp.write(result)

obj = json.load(open('taopiaopiao.json', 'r', encoding='utf-8'))
region_names = jsonpath.jsonpath(obj,'$..regionName')
print(region_names)

10.3 BeautifulSoup

Drawback: slower than lxml

Advantage: convenient to use

  1. Install: pip install bs4

  2. Import: from bs4 import BeautifulSoup

  3. Create the object

    • from a server response
    soup = BeautifulSoup(response.read().decode(), 'lxml')
    • from a local file
    soup = BeautifulSoup(open('1.html', encoding='utf-8'), 'lxml')
    # note: open() defaults to the system encoding (gbk on Chinese Windows), so specify the encoding explicitly

10.3.1 Locating nodes

  1. Look up a node by tag name

    • soup.a
      • returns only the first a tag; append an attribute name to read its value
      • soup.a.name
      • soup.a.attrs
  2. Functions

    1. .find() (returns a single object)

      • find('a') : returns only the first a tag
      • find('a', title='some title')
      • find('a', class_='some class')
        • the class attribute must be written as class_
    2. .find_all() (returns a list)

      • find_all('a') : finds all a tags
      • find_all(['a', 'span']) : returns all a and span tags
      • find_all('a', limit=2) : returns only the first two a tags
    3. .select() (returns node objects matching a CSS selector) [recommended]

      1. element

        • e.g. p
        • returns a list and may contain multiple results
        soup.select('a')
      2. .class

        • e.g. .firstname
        soup.select('.a1')
      3. #id

        • e.g. #firstname
        soup.select('#l1')
      4. Attribute selectors

        • li tags that have an id attribute
        soup.select('li[id]')
        • li tags with id="l2"
        soup.select('li[id="l2"]')
      5. Combinators

        • descendant: element element
        • child: element>element
        • element,element : union of both selectors (commonly used for same-level elements)

10.3.2 Node information

  1. Getting a node's content

    1. obj.string
    2. obj.get_text() [recommended]
  2. Node attributes

    1. tag.name returns the tag name

      e.g. tag = find('li')

      print(tag.name)

    2. tag.attrs returns the attributes as a dict

  3. Getting a node attribute (a combined sketch follows)

    1. obj.attrs.get('title') [most common]
    2. obj.get('title')
    3. obj['title']
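A minimal end-to-end sketch of the calls above, run against a small made-up HTML snippet (assumes bs4 and lxml are installed):

from bs4 import BeautifulSoup

# made-up HTML used only to demonstrate the API
html = '''
<ul>
  <li id="l1" class="a1"><a href="/1" title="first">apple</a></li>
  <li id="l2"><a href="/2">banana</a></li>
</ul>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.a)                         # the first <a> tag
print(soup.find('a', title='first'))  # find by attribute
print(soup.find_all('a', limit=2))    # list of the first two <a> tags
print(soup.select('li[id="l2"] a'))   # CSS selector: <a> inside the li with id="l2"

tag = soup.find('a')
print(tag.get_text())          # apple
print(tag.attrs.get('title'))  # first
print(tag['href'])             # /1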

11. Summary of anti-crawling measures

11.1 Header checks

The server validates browser information (User-Agent and related headers).

11.2 Cookie checks

The server validates login state.

11.3 Referer checks

The server checks whether the request was referred from a specific page; commonly used for image hotlink protection.
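Countering a Referer check works the same way as the User-Agent counter: send the header the server expects. A minimal sketch with placeholder URLs:

import urllib.request

# hypothetical hotlink-protected image and the page that embeds it
url = 'https://example.com/images/pic.jpg'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    # pretend the request came from the page that embeds the image
    'Referer': 'https://example.com/gallery.html',
}

request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)

with open('pic.jpg', 'wb') as f:
    f.write(response.read())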