Installing and Using scrapy-redis

Documentation:

https://scrapy-redis.readthedocs.io/

Install scrapy-redis

Scrapy was already installed earlier, so here we just install scrapy-redis:

  1. pip install scrapy-redis
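A quick sanity check is to import the package from Python (this assumes the installed release exposes a `__version__` attribute, which recent scrapy-redis versions do):

    import scrapy_redis

    # Print the installed version to confirm the install succeeded
    print(scrapy_redis.__version__)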

Modify the scrapy-redis example project

First grab the scrapy-redis example from GitHub, then copy its example-project directory to the location you want:

  1. git clone https://github.com/rolando/scrapy-redis.git
  2. cp -r scrapy-redis/example-project ./scrapy-youyuan

Alternatively, download the whole project as scrapy-redis-master.zip, unzip it, and then:

  1. cp -r scrapy-redis-master/example-project/ ./redis-youyuan
  2. cd redis-youyuan/

Inspect the project directory with tree

(Figure 1: tree output of the copied project directory)
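For reference, the example-project in the scrapy-redis repository is laid out roughly as follows (file names may vary slightly between versions):

    redis-youyuan/
    ├── example/
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders/
    │       ├── __init__.py
    │       ├── dmoz.py
    │       ├── mycrawler_redis.py
    │       └── myspider_redis.py
    ├── process_items.py
    ├── requirements.txt
    └── scrapy.cfg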

Modify settings.py

The scrapy-redis related parts of the modified settings file are listed below; middleware, proxy, and other settings are omitted here.

https://scrapy-redis.readthedocs.io/en/stable/readme.html

Note: Chinese comments in settings.py can trigger encoding errors, so they are replaced with English ones below.

    # Use the scrapy-redis scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"

    # Keep the scrapy-redis queues in redis, which allows pausing and resuming crawls
    SCHEDULER_PERSIST = True

    # Queue class used to order the URLs to crawl; the default orders by priority
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
    # Optional FIFO ordering
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
    # Optional LIFO ordering
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'

    # Only meaningful with SpiderQueue or SpiderStack: maximum idle time before the spider closes
    SCHEDULER_IDLE_BEFORE_CLOSE = 10

    # Use RedisPipeline to store items in redis
    ITEM_PIPELINES = {
        'example.pipelines.ExamplePipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 400,
    }

    # Redis connection parameters
    # REDIS_PASS is a redis password the author added; it requires a small change to the
    # scrapy-redis source to support password-protected connections
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379

    # Custom redis client parameters (i.e.: socket timeout, etc.)
    REDIS_PARAMS = {}
    # REDIS_URL = 'redis://user:pass@hostname:9001'
    # REDIS_PARAMS['password'] = 'itcast.cn'

    LOG_LEVEL = 'DEBUG'

    # The class used to detect and filter duplicate requests.
    # The default (RFPDupeFilter) filters based on the request fingerprint using
    # scrapy.utils.request.request_fingerprint. To change how duplicates are checked,
    # subclass RFPDupeFilter and override its request_fingerprint method, which should
    # accept a scrapy Request object and return its fingerprint (a string).
    DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'

    # By default, RFPDupeFilter only logs the first duplicate request.
    # Setting DUPEFILTER_DEBUG to True makes it log all duplicate requests.
    DUPEFILTER_DEBUG = True

    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate, sdch',
    }
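With SCHEDULER_PERSIST = True, the scheduler state survives between runs, which is what makes pause/resume possible. A small sketch for inspecting that state with the redis-py client, assuming the default key names %(spider)s:requests, %(spider)s:dupefilter, and %(spider)s:items for a spider named youyuan:

    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379)

    # Requests waiting to be scheduled (a sorted set when using SpiderPriorityQueue)
    print(r.zcard('youyuan:requests'))
    # Fingerprints of requests already seen by the dupefilter (a set)
    print(r.scard('youyuan:dupefilter'))
    # Items pushed by RedisPipeline (a list)
    print(r.llen('youyuan:items'))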

Inspect pipelines.py

    from datetime import datetime

    class ExamplePipeline(object):
        def process_item(self, item, spider):
            # Stamp each item with the crawl time and the spider that produced it
            item["crawled"] = datetime.utcnow()
            item["spider"] = spider.name
            return item
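Because RedisPipeline serializes every item and pushes it into redis, the results can be consumed outside Scrapy (the example project ships a process_items.py script for exactly this). A rough sketch along the same lines, assuming the items end up as JSON strings in the youyuan:items list:

    import json
    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379)

    # Pop items pushed by RedisPipeline; each entry is a JSON-encoded dict
    while True:
        data = r.lpop('youyuan:items')
        if data is None:
            break
        item = json.loads(data)
        print(item.get('username'), item.get('source_url'))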

Project example

As an example, we crawl youyuan.com (a dating site) for profiles of women aged 18-25 in Beijing.

Modify items.py

Add the Profile item that we ultimately want to save:

    from scrapy.item import Item, Field

    class Profile(Item):
        # avatar image URL
        header_url = Field()
        # album picture URLs
        pic_urls = Field()
        username = Field()
        # personal monologue text
        monologue = Field()
        age = Field()
        # source site, e.g. youyuan
        source = Field()
        source_url = Field()
        crawled = Field()
        spider = Field()

Modify the spider file

Add a youyuan.py file under the spiders directory and write our spider in it; after that the crawl can be run. A simple version is provided here:

    # -*- coding: utf-8 -*-
    import re

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    from example.items import Profile


    class YouyuanSpider(CrawlSpider):
        name = 'youyuan'
        allowed_domains = ['youyuan.com']
        # youyuan.com list page
        start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
        pattern = re.compile(r'[0-9]')

        # Extract links to list pages and profile pages; the new requests are
        # saved to redis and wait there to be scheduled
        profile_page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/\d+-profile/',))
        page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/',))

        rules = (
            Rule(page_lx, callback='parse_list_page', follow=True),
            Rule(profile_page_lx, callback='parse_profile_page', follow=False),
        )

        # Handle a list page; not strictly needed, kept as a convenient debugging hook
        def parse_list_page(self, response):
            print("Processed list %s" % response.url)
            # print(response.body)
            self.profile_page_lx.extract_links(response)

        # Handle a profile page and build the Profile item we want
        def parse_profile_page(self, response):
            print("Processing profile %s" % response.url)
            profile = Profile()
            profile['header_url'] = self.get_header_url(response)
            profile['username'] = self.get_username(response)
            profile['monologue'] = self.get_monologue(response)
            profile['pic_urls'] = self.get_pic_urls(response)
            profile['age'] = self.get_age(response)
            profile['source'] = 'youyuan'
            profile['source_url'] = response.url
            # print("Processed profile %s" % response.url)
            yield profile

        # Extract the avatar image URL
        def get_header_url(self, response):
            header = response.xpath('//dl[@class="personal_cen"]/dt/img/@src').extract()
            if len(header) > 0:
                header_url = header[0]
            else:
                header_url = ""
            return header_url.strip()

        # Extract the username
        def get_username(self, response):
            usernames = response.xpath('//dl[@class="personal_cen"]/dd/div/strong/text()').extract()
            if len(usernames) > 0:
                username = usernames[0]
            else:
                username = ""
            return username.strip()

        # Extract the personal monologue
        def get_monologue(self, response):
            monologues = response.xpath('//ul[@class="requre"]/li/p/text()').extract()
            if len(monologues) > 0:
                monologue = monologues[0]
            else:
                monologue = ""
            return monologue.strip()

        # Extract the album picture URLs, joined with '|'
        def get_pic_urls(self, response):
            pic_urls = []
            data_url_full = response.xpath('//li[@class="smallPhoto"]/@data_url_full').extract()
            if len(data_url_full) <= 1:
                pic_urls.append("")
            else:
                for pic_url in data_url_full:
                    pic_urls.append(pic_url)
            if len(pic_urls) <= 1:
                return ""
            return '|'.join(pic_urls)

        # Extract the age
        def get_age(self, response):
            age_urls = response.xpath('//dl[@class="personal_cen"]/dd/p[@class="local"]/text()').extract()
            if len(age_urls) > 0:
                age = age_urls[0]
            else:
                age = ""
            age_words = re.split(' ', age)
            if len(age_words) <= 2:
                return "0"
            # e.g. "20岁" -> drop the trailing character to keep just the number
            age = age_words[2][:-1]
            if self.pattern.match(age):
                return age
            return "0"