1. The spider. Note that title/star and the picture need different XPaths: the first two live under the info div, while the image lives under the pic div. The for loop walks the item nodes one movie at a time, pulling out the title, rating, and poster URL, and yields once per item so the pipelines can process it. Once the items on a page are exhausted, the spider extracts the next-page link and feeds it back into parse. (The scrapy shell snippet after the code shows how to sanity-check the XPaths.)
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class Douban250Spider(scrapy.Spider):
    name = 'douban250'
    # allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        for sel in response.xpath('//div[@class="item"]'):
            item = DoubanItem()
            # title and rating sit under div.info; the poster sits under div.pic
            item['title'] = sel.xpath(
                'div[@class="info"]/div[@class="hd"]/a/span/text()').extract()[0]
            item['star'] = sel.xpath(
                'div[@class="info"]/div[@class="bd"]/div[@class="star"]'
                '/span[@class="rating_num"]/text()').extract()[0]
            item['image_urls'] = sel.xpath('div[@class="pic"]/a/img/@src').extract()
            yield item
        # once every item on the page has been yielded, follow the next-page link;
        # extract() returns an empty list on the last page, so guard before indexing
        next_page = response.xpath(
            '//div[@class="paginator"]/span[@class="next"]/a/@href').extract()
        if next_page:
            next_url = 'https://movie.douban.com/top250' + next_page[0].strip()
            yield scrapy.http.Request(next_url, callback=self.parse, dont_filter=True)
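The XPaths are easy to get wrong, so it is worth sanity-checking them in scrapy shell before running the crawl. Douban tends to reject the default Scrapy User-Agent, so passing a browser-like one via -s (which sets any Scrapy setting from the command line) is a reasonable precaution; the UA string below is just an example:

scrapy shell -s USER_AGENT="Mozilla/5.0" "https://movie.douban.com/top250"
>>> sel = response.xpath('//div[@class="item"]')[0]
>>> sel.xpath('div[@class="info"]/div[@class="hd"]/a/span/text()').extract()[0]
>>> sel.xpath('div[@class="pic"]/a/img/@src').extract()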
2. The settings file. Register the pipelines (there are two: one for the text, one for the images) and set a random User-Agent, picked from the list below:
# -*- coding: utf-8 -*-
# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
BOT_NAME = 'douban'
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'
FEED_EXPORT_ENCODING = 'utf-8'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
UA = random.choice(user_agent_list)
USER_AGENT = UA
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
IMAGES_STORE = r'D:\python project\douban\images'  # raw string so backslashes survive
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'douban.middlewares.DoubanSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.DoubanDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 100,
    'douban.pipelines.SaveNameScore': 200,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
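One caveat about the User-Agent trick above: random.choice runs once, when settings.py is imported, so the entire crawl reuses a single UA. To pick a fresh one per request, the choice can be moved into a downloader middleware. A minimal sketch (RandomUserAgentMiddleware is my name for it, and the list would have to be renamed USER_AGENT_LIST, since Scrapy only picks up uppercase names from the settings module):

# douban/middlewares.py
import random

class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # reads the (renamed, uppercase) list defined in settings.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # overwrite whatever the built-in UserAgentMiddleware set earlier
        request.headers['User-Agent'] = random.choice(self.user_agents)

It would be registered in settings.py with a priority above 400, so it runs after (and overrides) the built-in scrapy.downloadermiddlewares.useragent.UserAgentMiddleware:

DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgentMiddleware': 543,
}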
3. The pipelines file, defining the text pipeline and the image pipeline. The image pipeline subclasses the built-in ImagesPipeline: override get_media_requests to turn each image URL into a Request, and override file_path to use the movie title and rating as the file name. Note that ImagesPipeline needs PIL support, so install Pillow. (The old import path scrapy.contrib.pipeline.images is gone from modern Scrapy; scrapy.pipelines.images is used below.)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sys
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

# Python 2 only: force UTF-8 as the default codec so the Chinese
# titles survive the trip into file names
reload(sys)
sys.setdefaultencoding('utf8')


class DoubanPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # one download request per image URL; the item travels along in meta
        # so file_path can read the title and rating back out
        for image_url in item['image_urls']:
            yield Request(url=image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        # the item passed through meta above (append str(random.random())
        # to the name if collisions are a concern)
        item = request.meta['item']
        # file name: <title>_<rating>.<ext>, where the extension (jpg, png, ...)
        # is the last dot-separated piece of the image URL
        image_guid = (item['title'] + '_' + item['star'] + '.' +
                      request.url.split('/')[-1].split('.')[-1])
        # title and star must already be plain strings here; if they were still
        # lists, the name would collapse into an escaped mess like u97e9u56fd...
        filename = u'full/{0}'.format(image_guid)
        return filename

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
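For context, the results argument handed to item_completed holds one (success, detail) tuple per image request; per the Scrapy documentation, the detail dict of a successful download carries the source url, the path relative to IMAGES_STORE, and a checksum. Roughly (values illustrative):

results = [
    (True, {'url': 'https://img3.doubanio.com/.../p480747492.jpg',
            'path': u'full/肖申克的救赎_9.7.jpg',  # the name built by file_path above
            'checksum': '2b00042f7481c7b056c4b410d28f33cf'}),
    (False, Failure),  # a failed download arrives as (False, Failure)
]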
For the title-and-rating pipeline, the values are simply written out to a text file:

class SaveNameScore(object):
    def __init__(self):
        self.file = open('douban_top250.txt', mode='w')

    def process_item(self, item, spider):
        line = 'The top250 movie list:'
        title = item['title']
        star = item['star']
        line = line + ' ' + title + ' ' + star + '\n'
        self.file.write(line)
        return item  # hand the item on so any later pipeline still sees it

    def close_spider(self, spider):
        self.file.close()

The items file is written like this:
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    star = scrapy.Field()
    image_urls = scrapy.Field()  # read by the image pipeline
    images = scrapy.Field()      # filled in by the image pipeline

Result: the titles and ratings end up in douban_top250.txt, and the movie posters are saved under IMAGES_STORE/full/, one image per film.
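With all four files in place, the crawl runs from the project root; the optional -o flag additionally exports the items through Scrapy's feed exporter, which is where the FEED_EXPORT_ENCODING = 'utf-8' setting above comes into play:

scrapy crawl douban250
scrapy crawl douban250 -o top250.json  # same crawl, plus a JSON feed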
The full code is available at: https://github.com/xzxin/douban_scrapy