先用 re 把详情页里目标标签内的所有内容匹配出来（原文此处的标签名在排版时丢失了）：
# NOTE(review): the original line here was
#     tel_page = re.findall(r'<...>(.*?)<...>', response1.text, re.S)
# but the HTML tag inside the raw string was stripped when the article was
# published, leaving an unterminated string literal.  Because the phone
# pattern below is specific enough, we can search the whole page instead.
tel_str = response1.text
# Match mainland-China mobile numbers on the detail page.  The published
# pattern had lost every backslash ("d{8}" instead of "\d{8}") and held
# stray "," / "^:" characters inside the classes, so it matched nothing.
tel = re.findall(
    r'13[0-9]\d{8}|14[57]\d{8}|15[0-35-9]\d{8}|17[035-8]\d{8}'
    r'|18[0-9]\d{8}|166\d{8}|19[89]\d{8}',
    tel_str)
html_tree1 = etree.HTML(response1.content)
# Grab the merchant name with XPath.
biaoti = html_tree1.xpath('//*[@id="dt_title"]/text()')
# Grab the publisher's profile link (newly added compared to earlier steps).
fabuzhe = html_tree1.xpath('//*[@id="subinfo_name"]/@href')
print(tel)
print(biaoti[1].strip())
# Escape the "." — unescaped it would match any character before "html".
pub = re.search(r'/timeline/(.*?)\.html', fabuzhe[0])
publishUserId36 = pub.group(1)
print(publishUserId36)
# POST the publisher id to the JSON API to fetch the publisher's details.
url1 = 'http://api.hdb.com/ajax/api:4009?'
data = {
    'queryType': 2,
    'publishUserId36': publishUserId36,
}
response2 = requests.post(url=url1, data=data, headers=headers)
fabuzhe_xiangxi = response2.json()
# Pull the shop name out of the JSON payload.
fabuzhe_xiangxi1 = fabuzhe_xiangxi['result']['shopName']
最后来一个整体代码:
import requests
from urllib.parse import quote
import random
import re
from lxml import etree
# Pool of desktop-browser User-Agent strings; one is picked at random for
# the whole run so the crawler's requests look less uniform.
user_agent = [
    # Safari 5.1 on macOS 10.6
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    # Safari 5.1 on Windows 7
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    # Firefox 38 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    # Internet Explorer 11 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
]
# Shared request headers: a single randomly chosen UA for every request.
headers = {"User-Agent": random.choice(user_agent)}
# Search keywords to crawl.
dict_objs = ['绘画', '主持人']

# Matches mainland-China mobile numbers.  The published version of this
# pattern had lost every backslash ("d{8}" instead of "\d{8}") and held
# stray "," / "^:" characters inside the classes, so it never matched.
# Compiled once here instead of re-compiled on every detail page.
TEL_PATTERN = re.compile(
    r'13[0-9]\d{8}|14[57]\d{8}|15[0-35-9]\d{8}|17[035-8]\d{8}'
    r'|18[0-9]\d{8}|166\d{8}|19[89]\d{8}'
)

for dict_obj in dict_objs:
    encode_after = quote(dict_obj)
    # Renamed from "i": the detail-url loop below also used "i", shadowing
    # the page counter.
    for page in range(1, 80):
        data = {
            'word': encode_after,
            'area_code': 'quanguo',
            'page_num': str(page),
        }
        url = "http://www.hdb.com/info_search?"
        response = requests.get(url=url, headers=headers, params=data)
        print(response.url)
        html_tree = etree.HTML(response.content)
        # All detail-page links on this result page.
        ret = html_tree.xpath('/html/body/div[2]/div/ul/li/div/h3/a/@href')
        for detail_url in ret:
            response1 = requests.get(url=detail_url, headers=headers)
            # NOTE(review): the published article's "try:" had no "except"
            # clause (a SyntaxError); its narrowing regex also had its HTML
            # tag stripped, leaving an unterminated string literal.  The
            # phone pattern is specific enough to search the whole page.
            try:
                tel = TEL_PATTERN.findall(response1.text)
                html_tree1 = etree.HTML(response1.content)
                biaoti = html_tree1.xpath('//*[@id="dt_title"]/text()')
                fabuzhe = html_tree1.xpath('//*[@id="subinfo_name"]/@href')
                print(tel)
                print(biaoti[1].strip())
                # Escape the "." — unescaped it matches any char before "html".
                pub = re.search(r'/timeline/(.*?)\.html', fabuzhe[0])
                publishUserId36 = pub.group(1)
                print(publishUserId36)
                # POST the publisher id to the JSON API for publisher details.
                url1 = 'http://api.hdb.com/ajax/api:4009?'
                data1 = {
                    'queryType': 2,
                    'publishUserId36': publishUserId36,
                }
                response2 = requests.post(url=url1, data=data1, headers=headers)
                fabuzhe_xiangxi = response2.json()
                fabuzhe_xiangxi1 = fabuzhe_xiangxi['result']['shopName']
            except (IndexError, AttributeError, KeyError, ValueError,
                    requests.RequestException):
                # Detail pages missing a title/publisher link, malformed
                # JSON, or transient network errors shouldn't abort the
                # whole crawl — skip to the next detail page.
                continue
喜欢就点个赞吧