Spider

Spider组件可以方便用户快速搭建分布式多协程爬虫。用户只需关心product和consume;product中对DOM的解析推荐使用QueryList。

安装

  1. composer require easyswoole/spider

快速使用

以百度搜索为例,根据搜索关键词爬出每次检索结果前几页的特定数据。本示例纯属教学目的,如有冒犯贵公司还请及时通知,我们会及时调整。

Product

  1. <?php
  2. namespace App\Spider;
  3. use EasySwoole\HttpClient\HttpClient;
  4. use EasySwoole\Spider\Config\ProductConfig;
  5. use EasySwoole\Spider\Hole\ProductAbstract;
  6. use EasySwoole\Spider\ProductResult;
  7. use QL\QueryList;
  8. use EasySwoole\FastCache\Cache;
  /**
   * Example producer for the Baidu-search crawl.
   *
   * Each job fetches one search-result page, extracts the link and title of
   * every hit for the consumer, and - when the job is the first page of a
   * keyword - schedules the remaining pages of that keyword plus the first page
   * of the next keyword waiting in the FastCache queue.
   */
  class ProductTest extends ProductAbstract
  {
      /** Name of the FastCache queue holding keywords that still have to be crawled. */
      const SEARCH_WORDS = 'SEARCH_WORDS';

      /** Number of result pages crawled for every keyword. */
      const PAGES_PER_WORD = 5;

      /** Step of the `pn` query parameter between two consecutive result pages. */
      const RESULTS_PER_PAGE = 10;

      /**
       * Fetches the configured URL and turns it into consume data and follow-up jobs.
       *
       * @return ProductResult Carries the extracted `href`/`text` pairs and the
       *                       configs of the jobs to run next (possibly none).
       */
      public function product(): ProductResult
      {
          // Fetch the page for the URL this job was configured with.
          $httpClient = new HttpClient($this->productConfig->getUrl());
          $httpClient->setHeader('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36');
          $body = $httpClient->get()->getBody();

          // First grab the content of each hit's title element, keeping its <a> tag.
          $rules = [
              'search_result' => ['.c-container .t', 'text', 'a']
          ];
          $searchResult = QueryList::rules($rules)->html($body)->query()->getData();

          $data = [];
          foreach ($searchResult as $result) {
              // Parse the fragment once and read both values from the same node.
              $link = QueryList::html($result['search_result'])->find('a');
              $data[] = [
                  'href' => $link->attr('href'),
                  'text' => $link->text()
              ];
          }

          $productJobOtherInfo = $this->productConfig->getOtherInfo();

          // Next batch of jobs: only the first page of a keyword fans out, so
          // follow-up pages never schedule further work themselves.
          $productJobConfigs = [];
          if (($productJobOtherInfo['page'] ?? null) === 1) {
              $currentWord = $productJobOtherInfo['word'];
              for ($page = 2; $page <= self::PAGES_PER_WORD; $page++) {
                  $productJobConfigs[] = $this->buildJobConfig($currentWord, $page);
              }

              // Chain to the next pending keyword, if there is one.
              // NOTE(review): assumes deQueue() yields the enqueued string, or a
              // non-string value when the queue is empty - confirm against FastCache.
              $nextWord = Cache::getInstance()->deQueue(self::SEARCH_WORDS);
              if (is_string($nextWord) && $nextWord !== '') {
                  $productJobConfigs[] = $this->buildJobConfig($nextWord, 1);
              }
          }

          $result = new ProductResult();
          $result->setProductJobConfigs($productJobConfigs)->setConsumeData($data);
          return $result;
      }

      /**
       * Builds the job config for one result page of a keyword.
       *
       * @param string $word Search keyword (raw, not URL-encoded).
       * @param int    $page 1-based result page number.
       *
       * @return array Job config with the `url` and `otherInfo` keys.
       */
      private function buildJobConfig(string $word, int $page): array
      {
          $pn = ($page - 1) * self::RESULTS_PER_PAGE;

          return [
              // The keyword is URL-encoded so spaces, '+', '&' or non-ASCII text
              // cannot corrupt the query string.
              'url' => 'https://www.baidu.com/s?wd=' . urlencode($word) . "&pn={$pn}",
              'otherInfo' => [
                  'word' => $word,
                  'page' => $page
              ]
          ];
      }
  }

Consume

我这里直接存文件了,可按照需求自己定制

  1. <?php
  2. namespace App\Spider;
  3. use EasySwoole\Spider\ConsumeJob;
  4. use EasySwoole\Spider\Hole\ConsumeAbstract;
  /**
   * Example consumer: appends every crawled item to a local text file.
   */
  class ConsumeTest extends ConsumeAbstract
  {
      /**
       * Writes the job data to `baidu.txt`, one item per line with its fields
       * separated by tabs.
       */
      public function consume()
      {
          $rows = [];
          foreach ($this->getJobData() as $fields) {
              // One tab-separated, newline-terminated row per item.
              $rows[] = implode("\t", $fields) . "\n";
          }

          // Append so the output of earlier jobs is preserved.
          file_put_contents('baidu.txt', implode('', $rows), FILE_APPEND);
      }
  }

注册爬虫组件

  /**
   * Registers the spider component on the main Swoole server.
   *
   * @param EventRegister $register Event register supplied by the framework (unused here).
   */
  public static function mainServerCreate(EventRegister $register)
  {
      $config = [
          // Required: producer class.
          'product' => ProductTest::class,
          // Required: consumer class.
          'consume' => ConsumeTest::class,
          // Queue type. The default fast-cache queue does not support distributed
          // deployments; use SpiderConfig::QUEUE_TYPE_REDIS for that, or implement
          // your own communication queue.
          'queueType' => SpiderConfig::QUEUE_TYPE_FAST_CACHE,
          // Custom communication queue; not needed with the built-in queues.
          'queue' => '自定义队列,如使用组件自带则不需要',
          // Custom queue configuration; currently only needed for SpiderConfig::QUEUE_TYPE_REDIS.
          'queueConfig' => '自定义队列配置,目前只有SpiderConfig::QUEUE_TYPE_REDIS需要',
          // Maximum number of concurrent coroutines (per machine).
          'maxCurrency' => 128
      ];

      $swooleServer = ServerManager::getInstance()->getSwooleServer();

      $spiderServer = SpiderServer::getInstance();
      $spiderServer
          ->setSpiderConfig($config)
          ->attachProcess($swooleServer);
  }

投递任务

  // Queue every keyword that should be crawled.
  $words = [
      'php',
      'java',
      'go'
  ];
  foreach ($words as $word) {
      Cache::getInstance()->enQueue('SEARCH_WORDS', $word);
  }

  // Start the crawl with the first queued keyword. The URL is built from the
  // dequeued keyword so it always matches the `word` recorded in the job info.
  $wd = Cache::getInstance()->deQueue('SEARCH_WORDS');
  if (is_string($wd) && $wd !== '') {
      SpiderClient::getInstance()->addJob(
          'https://www.baidu.com/s?wd=' . urlencode($wd) . '&pn=0',
          [
              'page' => 1,
              'word' => $wd
          ]
      );
  }