美团App热门商圈团购采集(2)

把上一节爬虫生成的城市信息文件 items.json 重命名为 city_items.json,作为第二部分爬虫的启动数据。

添加items.py

  1. class MeituanItem(Item):
  2. data = Field()

创建模板:

  1. scrapy genspider -t basic Meituan_meishi meituan.com

添加以下代码到Meituan_meishi.py

  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. import codecs
  4. import json
  5. from tutorial.items import MeituanItem
  6. import re
  7. class MeituanMeishiSpider(scrapy.Spider):
  8. '''
  9. 美食团购页面信息采集
  10. '''
  11. name = "Meituan_meishi"
  12. allowed_domains = ["meituan.com"]
  13. '''
  14. start_urls = (
  15. 'http://www.meituan.com/',
  16. )
  17. '''
  18. offset = 0
  19. def start_requests(self):
  20. file = codecs.open('city_items.json', 'r', encoding='utf-8')
  21. for line in file:
  22. item = json.loads(line)
  23. cityid = item['data']['cityid']
  24. latitude = item['data']['latitude']
  25. longitude= item['data']['longitude']
  26. lat = round(float(latitude), 6)
  27. lng= round(float(longitude), 6)
  28. url = 'http://api.mobile.meituan.com/group/v4/deal/select/city/42/cate/1?sort=defaults&mypos='+ str(lat) +'%2C'+ str(lng) +'&offset=0&limit=15'
  29. yield scrapy.Request(url,callback=self.parse)
  30. break
  31. file.close()
  32. def parse(self, response):
  33. '''
  34. 数据存储以及翻页操作
  35. '''
  36. item = MeituanItem()
  37. data = json.loads(response.body)
  38. item['data']=dict()
  39. item['data'] = data
  40. yield item
  41. offset = re.search('offset=(\d+)',response.request.url).group(1)
  42. url = re.sub('offset=\d+','offset='+str(int(offset)+15),response.request.url)
  43. yield scrapy.Request(url,callback=self.parse)

运行:

  1. scrapy runspider tutorial/spiders/Meituan_meishi.py