China Food and Drug Administration (国家食品药品监督管理总局)

http://app1.sfda.gov.cn/datasearch/face3/dir.html
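
The search page at this address builds its result list with JavaScript, so the spider below does not parse the page itself: it POSTs the same form parameters the page sends to search.jsp, extracts each record's id from the callbackC(...) links in the response, and then requests the corresponding content.jsp detail page.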

items.py: add the following code

    import scrapy

    class Sfda1Item(scrapy.Item):
        # A single catch-all field; each record is stored as a dict of
        # field-name -> value pairs scraped from the detail page.
        data = scrapy.Field()
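
The field names on each detail page differ from record to record, so the item declares a single catch-all `data` field that holds the whole name/value dict instead of one Field per column.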

Create a new custom spider in the spiders directory

    # -*- coding: utf-8 -*-
    import re
    import urllib.parse

    import scrapy
    from scrapy.http import FormRequest

    from tutorial.items import Sfda1Item


    class SfdaSpider(scrapy.Spider):
        name = 'sfda'
        allowed_domains = ['sfda.gov.cn']

        def start_requests(self):
            url = 'http://app1.sfda.gov.cn/datasearch/face3/search.jsp'
            data = {
                'tableId': '32',
                'State': '1',  # the form repeats State; a dict keeps a single copy
                'bcId': '124356639813072873644420336632',
                'tableName': 'TABLE32',
                'viewtitleName': 'COLUMN302',
                'viewsubTitleName': 'COLUMN299,COLUMN303',
                'curstart': '1',
                # The site expects the GBK percent-encoding of the Chinese value.
                'tableView': urllib.parse.quote('国产药品商品名', encoding='gbk'),
            }
            yield FormRequest(url=url, formdata=data, meta={'data': data},
                              callback=self.parse_content)

        def parse_content(self, response):
            # Each result row is a link whose onclick passes '...&Id=xxx' to
            # callbackC; pull out the id and fetch the record's detail page.
            for site in response.xpath('//a').re(r"callbackC,'(.*?)',null"):
                record_id = re.search(r'.+Id=(.*?)$', site).group(1)
                url = ('http://app1.sfda.gov.cn/datasearch/face3/content.jsp'
                       '?tableId=32&tableName=TABLE32'
                       '&tableView=%B9%FA%B2%FA%D2%A9%C6%B7%C9%CC%C6%B7%C3%FB'
                       '&Id=' + record_id)
                yield scrapy.Request(
                    url,
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/52.0.2743.116 Safari/537.36'
                    },
                    callback=self.parse_detail)
            # Ask for the next page of results by bumping curstart.
            data = response.meta['data']
            data['curstart'] = str(int(data['curstart']) + 1)
            yield FormRequest(url=response.request.url, formdata=data,
                              meta={'data': data}, callback=self.parse_content)

        def parse_detail(self, response):
            item = {}
            # The record is one big table; skip the header and footer rows.
            for site in response.xpath('//table[1]//tr')[1:-1]:
                try:
                    name = site.xpath('./td/text()').extract()[0]
                    if not name:
                        continue
                    # Strip the tags from the second cell to get a plain value.
                    value = re.sub('<.*?>', '',
                                   site.xpath('./td')[1].extract()).strip()
                    item[name] = value
                except Exception as e:
                    self.logger.error('error: %s', e)
            sfa = Sfda1Item()
            sfa['data'] = item
            yield sfa
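
As written, parse_content increments curstart indefinitely, so the spider never stops paginating on its own (Scrapy's duplicate filter will not catch these requests because each POST body differs). A minimal guard, assuming that a page past the last result contains no callbackC links (that stop condition is an assumption about the site, not documented behavior):

    # Sketch of a bounded parse_content; the "no callbackC links means no
    # more pages" stop condition is an assumption, not documented behavior.
    def parse_content(self, response):
        ids = response.xpath('//a').re(r"callbackC,'(.*?)',null")
        for site in ids:
            ...  # fetch the detail pages exactly as above
        if ids:  # only queue the next page while this one returned results
            data = response.meta['data']
            data['curstart'] = str(int(data['curstart']) + 1)
            yield FormRequest(url=response.request.url, formdata=data,
                              meta={'data': data}, callback=self.parse_content)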

Add the following code to pipelines.py

    import codecs
    import json


    class JsonWriterPipeline(object):
        def __init__(self):
            self.file = codecs.open('items.json', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # One JSON object per line, with Chinese text kept readable.
            line = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.file.write(line)
            return item

        def close_spider(self, spider):
            # Scrapy calls close_spider() on pipelines, not spider_closed().
            self.file.close()
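
If all you need is the JSON Lines file, Scrapy's built-in feed exports can produce the same output without a custom pipeline, for example `scrapy crawl sfda -o items.jl`; the hand-written pipeline is still useful when you want explicit control over the encoding or file handling.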

settings.py: add the following code (to enable the pipeline)

    ITEM_PIPELINES = {
        'tutorial.pipelines.JsonWriterPipeline': 300,
    }
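
The value 300 is the pipeline's order: when several pipelines are enabled, Scrapy runs them from the lowest number to the highest (values are conventionally kept in the 0-1000 range), so with a single pipeline the exact number does not matter.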

Create a new main.py in the project root, used for debugging

    from scrapy import cmdline

    cmdline.execute('scrapy crawl sfda -L INFO'.split())
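
Running `python main.py` is equivalent to running `scrapy crawl sfda -L INFO` from a shell in the project root, which makes it convenient to start the spider under an IDE debugger.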