Scraping gallery images from mm131.com with a process pool + shared memory

Published 2019-04-15 16:37

The code is rough; it's just for fun.

```python
import os
import multiprocessing

import requests
from lxml import etree


def download(a, b, url1, headers, n, items2_list):
    # a: relative href of one image page (e.g. '5108_2.html')
    # b: first page of that gallery (e.g. '5108.html'), used as the Referer
    u = 'http://www.mm131.com/xinggan/' + b
    headers1 = {
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'img1.mm131.me',
        'Referer': u,  # the image host rejects requests without a site Referer
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }
    print(u)
    req3 = requests.get(url1 + a, headers=headers)
    url_f = etree.HTML(req3.content).xpath("//div/div/a/img/@src")
    url_str = ''.join(url_f)
    save_dir = r"F:\娱乐\爬图片\131mm\xgmm2"
    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, "xgmm" + str(n) + ".jpg"), "wb") as f:
        print("Saving image {}, {} left".format(n, len(items2_list)))
        f.write(requests.get(url_str, headers=headers1).content)
    items2_list.remove(a)


def htmlparser2(i, headers, items2_list, items_list):
    # Expand one gallery URL into the hrefs of all its individual pages.
    resp2 = requests.get(i, headers=headers)
    root2 = etree.HTML(resp2.content)
    items2 = root2.xpath('//div[@class="content-page"]/a/@href')
    items2 = list(set(items2))  # deduplicate pagination links
    items2_list.extend(items2)
    items_list.remove(i)  # drop each gallery URL once it has been parsed
    print("Pages in this gallery:", len(items2))


def htmlparser(i, headers, items_list):
    # Collect the gallery URLs from one listing page.
    url = "http://www.mm131.com/xinggan/list_6_{}.html".format(i)
    req = requests.get(url, headers=headers)
    root = etree.HTML(req.content)
    items = root.xpath('//dl[@class="list-left public-box"]/dd/a[@target="_blank"]/@href')
    print("Listing page {}".format(i))
    items_list.extend(items)


if __name__ == '__main__':
    url1 = "http://www.mm131.com/xinggan/"
    headers = {
        'User-Agent': 'Baiduspider+(+http://www.baidu.com/search/spider.html)',
        'Referer': 'https://www.baidu.com/link?url=mPARC6e0QgmXiBEX1UCXo62Hsl1XIxYOsAVJUsS9R_SumSXwtLn3_XcPCIxWUC7U&wd=&eqid=af85b81300074c38000000025ae83304',
        'Cookie': 'UM_distinctid=162299332741c7-07936edc87635f-7b113d-100200-162299332753a2; bdshare_firstime=1521115935504; CNZZDATA3866066=cnzz_eid%3D306650475-1494676185-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1494676185; Hm_lvt_9a737a8572f89206db6e9c301695b55a=1525162468,1525165327,1525165615,1525165652; Hm_lpvt_9a737a8572f89206db6e9c301695b55a=1525165916',
        'Upgrade-Insecure-Requests': '1',
        'Host': 'www.mm131.com',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    n = 1
    # Manager lists act as state shared by every worker process.
    items_list = multiprocessing.Manager().list()

    # Stage 1: 40 processes parse listing pages 2..139 in parallel.
    pool = multiprocessing.Pool(40)
    for i in range(2, 140):
        pool.apply_async(htmlparser, (i, headers, items_list))
    pool.close()
    pool.join()

    pool1 = multiprocessing.Pool(40)
    pool2 = multiprocessing.Pool(40)  # 40 worker processes for the downloads
    print(len(items_list))
    items2_list = multiprocessing.Manager().list()

    # Stage 2: expand every gallery into its individual pages. Iterate over a
    # snapshot, because htmlparser2 removes entries while workers are running.
    for i in list(items_list):
        print("Expanding gallery")
        print(i)
        pool1.apply_async(htmlparser2, (i, headers, items2_list, items_list))
    pool1.close()
    pool1.join()
    print(items2_list)

    # Stage 3: download one image per page, again over a snapshot, since
    # download() removes entries from items2_list as it finishes.
    for a in list(items2_list):
        print("*************************** downloading *****************************")
        b = a[:4] + a[-5:]  # '5108_2.html' -> '5108.html'; assumes 4-digit gallery ids
        print(b)
        n += 1
        pool2.apply_async(download, (a, b, url1, headers, n, items2_list))
    pool2.close()
    pool2.join()
    print("over")
```
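The heart of the script is the combination of `multiprocessing.Pool` and `multiprocessing.Manager().list()`. One caveat on the title: a `Manager().list()` is not true shared memory; it is a proxy object served by a separate manager process, so every read and write crosses a process boundary. Genuine shared memory would be `multiprocessing.Value`/`Array` or `multiprocessing.shared_memory`. Here is a minimal sketch of the pattern, with illustrative names only:

```python
import multiprocessing


def collect(page, results):
    # Every worker process mutates the same proxied list.
    results.append(page * page)


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    results = manager.list()  # proxy list shared across all workers
    with multiprocessing.Pool(4) as pool:
        for page in range(10):
            pool.apply_async(collect, (page, results))
        pool.close()
        pool.join()
    print(sorted(results))  # [0, 1, 4, 9, ..., 81]
```

For I/O-bound scraping like this, 40 processes is heavy: the workers spend most of their time waiting on the network, so a thread pool or simply fewer processes would usually give the same throughput with far less overhead.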
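The other trick worth calling out is the second header set built inside `download()`: the image host `img1.mm131.me` refuses hotlinked requests, so the script sets `Referer` to the gallery page the image belongs to. A stripped-down sketch of just that step follows; the URLs are illustrative, not taken from a live crawl:

```python
import requests

# Hypothetical image URL and its gallery page; the image host checks the
# Referer header and refuses requests that do not come "from" the site.
img_url = "http://img1.mm131.me/pic/5108/2.jpg"
page_url = "http://www.mm131.com/xinggan/5108.html"

resp = requests.get(img_url, headers={"Referer": page_url})
resp.raise_for_status()  # without the Referer this is typically a 403
with open("sample.jpg", "wb") as f:
    f.write(resp.content)
```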