03.01_Crawler (cookie login)
Log in to the 126 mailbox using cookies: copy the Cookie header from a logged-in browser session and attach it to the request.
import urllib.request
import urllib.parse
url = "http://mail.163.com/js6/s?sid=qADLcluZhoFKNKVOskZZDrECKQRTRnTD&func=mbox:listMessages&welcome_welcomemodule_yxRecomDwon_click=1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1&LeftNavfolder1Click=1&mbox_folder_enter=1"
headers = {
"Host": "mail.126.com",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
"Content-type": "application/x-www-form-urlencoded",
"Referer": "https://mail.126.com/js6/main.jsp?sid=EATdzbMHZJyLYLvWPrHHRwLXPKeAuLfJ",
"Cookie": "mail_health_check_time=1543370574870; NTES_SESS=Ph.2pQsx4GhOnHdn3nCRvMDRn_YMtseF2OL6wjSR0WyitMe.tG1Vzrs_kWiGrrYLfh3uKegpscHmChF.15E5KpmtCE7sy_1Adk3y0yc..krzqiobRiV8oNk2WbCA6LmWEbQdrWcvYwmTGy.An2Fs_pgzyCxSVTSuCszERWomv1YDwN7J3SYP2NqicTSIU.wF5YbPkOHyI_Nsi3g_3BbihHQnpGYv6Ens_; ANTICSRF=ea921eca74a0fbb777c08059be3e30b4; S_INFO=1543370570|0|#1&85#|dushine@126.com; P_INFO=dushine@126.com|1543370570|0|other|11&17|zhj&1543215952&mail126#zhj&330100#10#0#0|189800&0|mail126|dushine@126.com; mail_upx=t11hz.mail.126.com|t12hz.mail.126.com|t13hz.mail.126.com|t1hz.mail.126.com|t2hz.mail.126.com|t3hz.mail.126.com|t4hz.mail.126.com|t5hz.mail.126.com|t6hz.mail.126.com|t7hz.mail.126.com|t8hz.mail.126.com|t10hz.mail.126.com|c1bj.mail.126.com|c2bj.mail.126.com|c3bj.mail.126.com|c4bj.mail.126.com|c5bj.mail.126.com|c6bj.mail.126.com|c7bj.mail.126.com; mail_upx_nf=; mail_idc=; Coremail=1e4f4c89d6020%EATdzbMHZJyLYLvWPrHHRwLXPKeAuLfJ%g1a4.mail.126.com; MAIL_MISC=dushine@126.com; cm_last_info=dT1kdXNoaW5lJTQwMTI2LmNvbSZkPWh0dHAlM0ElMkYlMkZtYWlsLjEyNi5jb20lMkZqczYlMkZtYWluLmpzcCUzRnNpZCUzREVBVGR6Yk1IWkp5TFlMdldQckhIUndMWFBLZUF1TGZKJnM9RUFUZHpiTUhaSnlMWUx2V1BySEhSd0xYUEtlQXVMZkomaD1odHRwJTNBJTJGJTJGbWFpbC4xMjYuY29tJTJGanM2JTJGbWFpbi5qc3AlM0ZzaWQlM0RFQVRkemJNSFpKeUxZTHZXUHJISFJ3TFhQS2VBdUxmSiZ3PW1haWwuMTI2LmNvbSZsPS0xJnQ9LTE=; MAIL_SESS=Ph.2pQsx4GhOnHdn3nCRvMDRn_YMtseF2OL6wjSR0WyitMe.tG1Vzrs_kWiGrrYLfh3uKegpscHmChF.15E5KpmtCE7sy_1Adk3y0yc..krzqiobRiV8oNk2WbCA6LmWEbQdrWcvYwmTGy.An2Fs_pgzyCxSVTSuCszERWomv1YDwN7J3SYP2NqicTSIU.wF5YbPkOHyI_Nsi3g_3BbihHQnpGYv6Ens_; MAIL_SINFO=1543370570|0|#1&85#|dushine@126.com; MAIL_PINFO=dushine@126.com|1543370570|0|other|11&17|zhj&1543215952&mail126#zhj&330100#10#0#0|189800&0|mail126|dushine@126.com; secu_info=1; mail_entry_sess=46352de3f97c1b560da173f472dcb88380c236ecf542ecf7a4a97bdae2e7570d792b99432eb80b1f0c34396a7db37d5c85d42eeaae7f6158329febf4bc40f11c37a76a38c79d4869a42ff7b96a9cd033f7a1fe313dbb65cdc57ad778cd7c9f96f396864f0654070d3e9d5e092e8068b11b92205f0b99c9f6bf5bafcf185270b7db21e671f596e450bdadc615940656d539c1a0b3e26d9f66f28004c7852ce6acdacea90a36fed0c21740e2964b97467849a113b62c218076a64061cb5d4e5ce7; starttime=; _pk_ref.16.3540=%5B%22%22%2C%22%22%2C1543370648%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DTqg9NVriMY2VevVBdhNEvbncLSgvN_Byxptn9pEMybe%26wd%3D%26eqid%3De750872f00064f72000000025bfdf756%22%5D; _pk_id.16.3540=c0d95e69-c95a-454a-b5bf-722e5331b3dc.1543370648.1.1543370648.1543370648.; _pk_ses.16.3540=*; locale=; Coremail.sid=EATdzbMHZJyLYLvWPrHHRwLXPKeAuLfJ; mail_style=js6; mail_uid=dushine@126.com; mail_host=mail.126.com; JSESSIONID=A3900D3237A9BF4283B53FF379A9BEF9",
}
form_data = {
"sid": "EATdzbMHZJyLYLvWPrHHRwLXPKeAuLfJ",
"func": "mbox:listMessages",
"welcome_tips_phone_show": 1,
"LeftNavfolder1Click": 1,
"mbox_folder_enter": 1,
}
# Encode the form data and send the request; the Cookie header above is what
# authenticates us, so no username or password is needed here
form_data = urllib.parse.urlencode(form_data).encode()
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request, form_data)
print(response.read().decode())
Log in to Renren (www.renren.com) using cookies
import urllib.request
import urllib.parse
url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018103949939"
# Login form fields captured from the browser's login request
form_data = {
"email": "1328",
"icode": "",
"origURL": "http://www.renren.com/home",
"domain": "renren.com",
"key_id": "1",
"captcha_type": "web_login",
"password": "8208504fef0e593ad0",
"rkey": "39b3920f31f40",
"f": "http%3A%2F%2Fwww.renren.com%2F968904311%2Fprofile",
}
headers = {
"Host": "www.renren.com",
"Connection": "keep-alive",
"Content-Length": 291,
"Origin": "http://www.renren.com",
"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
"Content-Type": "application/x-www-form-urlencoded",
}
# Build the request, encode the form data, and send the login POST
request = urllib.request.Request(url=url, headers=headers)
form_data = urllib.parse.urlencode(form_data).encode()
response = urllib.request.urlopen(request, form_data)
print(response.read().decode())
CookieJar:
import http.cookiejar
import urllib.request

# Create a CookieJar object to hold the cookies the server sends back
cj = http.cookiejar.CookieJar()
# Build a handler around the CookieJar
handler = urllib.request.HTTPCookieProcessor(cj)
# Build an opener from the handler
opener = urllib.request.build_opener(handler)
All requests from this point on should be sent with opener.open(), because the opener carries the stored cookies along with every request.
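A minimal end-to-end sketch of this CookieJar flow, reusing the Renren login endpoint from the example above. The email and password values here are placeholders, and a real login would need the full field set shown earlier:
import http.cookiejar
import urllib.parse
import urllib.request

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

# 1. POST the login form; any Set-Cookie headers in the response are stored in cj
login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018103949939"
login_data = urllib.parse.urlencode({
    "email": "your_email",        # placeholder
    "password": "your_password",  # placeholder
}).encode()
print(opener.open(login_url, login_data).read().decode())

# 2. Later requests sent through the same opener carry those cookies automatically
home_page = opener.open("http://www.renren.com/home").read().decode()
print(home_page[:200])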
03.02_Crawler (regular expression parsing)
- Why use regular expressions? To extract the target data (links, image URLs, text) from the raw HTML string returned by the server.
- Rules
  - Single characters:
    - . : any character (except a newline, unless re.S is used)
    - [] : any one character listed inside the brackets
    - \d : a digit
    - \D : a non-digit
    - \w : a digit, letter, or underscore
    - \W : anything \w does not match
    - \s : whitespace (space/tab)
    - \S : anything \s does not match
  - Quantifiers:
    - * : zero or more
    - + : one or more
    - ? : zero or one
    - {m} : exactly m
    - {m,} : at least m
    - {m,n} : at least m and at most n
    - {,n} : at most n (treated as {0,n} in Python's re)
  - Boundaries:
    - \b : word boundary, \B : non-word boundary
    - $ : end of the string
    - ^ : start of the string
  - Groups:
    - ab{3} matches abbb
    - (ab){3} matches ababab
    - (){4} : the quantifier applies to the whole group
    - \1, \2 : backreferences that match the same text as the corresponding group
  - Greedy vs. non-greedy:
    - .*? : zero or more characters, as few as possible
    - .+? : one or more characters, as few as possible
  - re.I : case-insensitive matching
  - re.M : multi-line mode (^ and $ match at every line)
  - re.S : "single-line" mode (. also matches newlines)
  - match / search / findall : match at the start of the string / find the first match anywhere / find all matches
  - re.sub(pattern, replacement, string) : replace every match with the given content (see the short example after this list)
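A short, runnable sketch of the rules above; the html snippet and the pic.example.com URLs are made up for illustration:
import re

html = ('<div class="thumb"><img src="//pic.example.com/a.jpg"></div>'
        '<div class="thumb"><img src="//pic.example.com/b.jpg"></div>')

# findall + non-greedy .*? : pull out every src attribute
srcs = re.findall(r'<img src="(.*?)"', html)
print(srcs)  # ['//pic.example.com/a.jpg', '//pic.example.com/b.jpg']

# \d with a quantifier: grab a 4-digit year
print(re.search(r'\d{4}', 'published 2018-11-28').group())  # 2018

# re.sub: collapse runs of whitespace into a single space
print(re.sub(r'\s+', ' ', 'too   many    spaces'))  # too many spaces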
03.03_Crawler (image download)
"""
目标网站
http://www.qiushibaike.com
目标资源
下载某一个栏目下指定页码的图片
实现思路
1.确定url
2.伪装成浏览器
3.发送请求获取数据
4.解析数据,获取图片链接
5.下载图片到本地,url.request.retrieve(url, file_name)
"""
import os
import re
import urllib.request
import urllib.parse
# Build a Request object that pretends to be a browser
def handle_request(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request
# Parse the html content and extract the image links.
# The regex below is a reconstruction (the html tags in the original notes were
# stripped by the export); adjust it to the page's actual markup.
def parse_content(content):
    # print(content)
    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)"', re.S)
    result = pattern.findall(content)
    # print(result)
    print(len(result))
    return result
# Download the images
def download_pic(image_list):
    dir_name = "pic"
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    for img_src in image_list:
        # The src values are protocol-relative ("//..."), so prepend the scheme
        url_img = "http:" + img_src
        print(url_img)
        file_name = img_src.split("/")[-1]
        file_name = dir_name + "/" + file_name
        if file_name.endswith(".jpg"):
            print("Downloading %s" % file_name)
            # Send the request and save the response body to the file
            urllib.request.urlretrieve(url_img, filename=file_name)
            print("Finished downloading %s" % file_name)
def main():
    # url = "https://www.qiushibaike.com/imgrank/page/"
    page_count = int(input("How many pages of images: "))
    for page in range(1, page_count + 1):
        url = "https://www.qiushibaike.com/imgrank/page/" + str(page) + "/"
        print("Fetching page %d..." % page)
        # Build the request object
        request = handle_request(url)
        # Send the request and read the response
        content = urllib.request.urlopen(request).read().decode()
        print("Page %d fetched, parsing..." % page)
        image_list = parse_content(content)
        # Download the images locally
        download_pic(image_list)

if __name__ == "__main__":
    main()
03.04_Crawler (article download)
- http://www.yikexun.cn/
- Requirements:
  - Crawl the title and content of the specified pages
  - Save them to an html file, using an h1 tag for the title and p tags for the content
Target site
Target resource
Crawl the articles under one section of the site
Approach
Determine the url
Disguise the request as a browser
Build the request object
Send the request and get the response
Parse the response and extract the target tags
Download the content each title links to and write it out as an html file
import urllib.request
import urllib.parse
import os
import re
# Build a Request object that pretends to be a browser
def handle_request(url):
    headers = {
        "Host": "www.yikexun.cn",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9"
    }
    request = urllib.request.Request(url, headers=headers)
    return request
# Parse the list page and extract (link, title) pairs for each article.
# The regex below is a reconstruction (the html tags in the original notes were
# stripped); adjust it to the page's actual markup.
def parse_content(content):
    pattern = re.compile(r'<a href="(/t/\d+/\d+\.html)"[^>]*>(.*?)</a>', re.S)
    result = pattern.findall(content)
    print(result)
    return result
# Extract the body text of an article page.
# The class name in the regex is a placeholder; check the real page source.
def get_text(content):
    pattern = re.compile(r'<div class="content">(.*?)</div>', re.S)
    text = pattern.findall(content)
    return text[0]
# Download each article and append it to a local html file
def download_content(href_list):
    for href_title in href_list:
        title = href_title[1]
        href = "http://www.yikexun.cn" + href_title[0]
        print(href)
        # Build the request object and fetch the article page
        request = handle_request(href)
        content = urllib.request.urlopen(request).read().decode()
        text = get_text(content)
        # Write the article to the local file: the title in an h1 tag, the body after it
        # (the h1 wrapper follows the requirement stated above; the original tags were stripped)
        file_name = "人生爱情哲理名言.html"
        with open(file_name, mode="a", encoding="utf-8") as f:
            f.write("<h1>%s</h1>%s" % (title, text))
def main():
    url = "http://www.yikexun.cn/t/1210/3661.html"
    request = handle_request(url)
    # Send the request and get the page content
    content = urllib.request.urlopen(request).read().decode()
    # Parse the content to get each article's title and link
    href_list = parse_content(content)
    # Download the articles locally
    download_content(href_list)

if __name__ == "__main__":
    main()
03.05_Crawler (bs4 - BeautifulSoup)
- If the connection fails or downloads are slow, switch the pip source to a domestic mirror (Aliyun, Douban, NetEase, etc.)
- Windows
  - (1) Open File Explorer
  - (2) Type %appdata% in the address bar
  - (3) Create a new folder named pip there
  - (4) Inside the pip folder create a file named pip.ini with the following content:
[global]
timeout = 6000
index-url = https://mirrors.aliyun.com/pypi/simple/
trusted-host = mirrors.aliyun.com
- Linux
  - (1) cd ~
  - (2) mkdir ~/.pip
  - (3) vi ~/.pip/pip.conf
  - (4) Fill in exactly the same content as on Windows
- Install:
  pip install bs4
- bs4 also needs a third-party parser library; install it too:
  pip install lxml
- Basic usage:
  - Note: it works through selectors, similar to jQuery
  - from bs4 import BeautifulSoup
  - Usage: convert an html document into a soup object, then look up the content you want through that object's methods and attributes
  - (1) From a local file:
    - soup = BeautifulSoup(open('local_file.html'), 'lxml')
  - (2) From network content:
    - soup = BeautifulSoup(string_or_bytes, 'lxml')
  - (1) Look up by tag name
  - (2) Get attributes
    - soup.tag.attrs
    - soup.tag.attrs['href']
    - soup.a['href']
  - (3) Get content
    - soup.tag.string
    - soup.tag.text
    - soup.tag.get_text()
    - If the tag contains other tags mixed with text, string returns None, while the other two still return the text content (see the short sketch after this list)
  - (4) find
    - soup.find('a')
    - soup.find('a', title="xxx")
    - soup.find('a', alt="xxx")
    - soup.find('a', class_="xxx")
    - soup.find('a', id="xxx")
    - find can be called not only on soup but also on an ordinary tag object, in which case it searches inside that tag for a matching node
    - find always returns the first tag that matches
  - (5) find_all
    - soup.find_all('a')
    - soup.find_all(['a', 'b'])
    - soup.find_all('a', limit=2)
  - (6) select
    - Selects content with CSS selectors
    - Common selectors: tag, class, id, group, descendant/child, pseudo-class, and attribute selectors
      - a
      - .dudu
      - #lala
      - a, .dudu, #lala, .meme
      - div .dudu #lala .meme .xixi   (descendant: any number of levels down)
      - div > p > a > .lala   (child: exactly one level down)
      - input[name='lala']
    - select always returns a list; pick out a specific element by index, then read its attributes and children
    - select can also be called on an ordinary tag object; it then finds all matching nodes under that object
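A minimal sketch of the string vs. text/get_text() difference described above, using a tiny made-up snippet (the class name and href are arbitrary):
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="box">hello <a href="/x">link</a></div>', 'lxml')

print(soup.div.string)      # None - the div holds text plus a nested tag
print(soup.div.text)        # hello link
print(soup.div.get_text())  # hello link
print(soup.a.string)        # link - the a tag holds only text
print(soup.a['href'])       # /x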
Example code:
from bs4 import BeautifulSoup

soup = BeautifulSoup(open("new_file.html", encoding="utf-8"), "lxml")
print(soup)
print("*" * 10)

# Get a tag
print(soup.a)
# Get a tag's attributes
print(soup.a.attrs)
# Get one specific attribute of a tag
print(soup.a.attrs["href"])
print(soup.a["href"])

# Get the content
print(soup.a.text)
print(soup.a.string)
print(soup.a.get_text())
print("*" * 30)

# find - matches a single element only
print(soup.find("a"))
print(soup.find("a", class_="zuihuayin"))

# find_all
print(soup.find_all("a"))
print(soup.find_all("a", limit=2))
print(soup.find_all("a", class_="zuihuayin"))
# find_all can look up several tag names at once
print(soup.find_all(["a", "h3"]))

# select selectors
print("*" * 30)
print(soup.select(".zuihuayin"))
print(soup.select("a"))
print(soup.select("#Li"))
print(soup.select("#Li")[0].text)
# Hierarchical (descendant) selector
print(soup.select(".tang h3"))