抓取百度贴吧

采集“网络爬虫”吧中所有帖子的信息

http://tieba.baidu.com/f?ie=utf-8&kw=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&fr=search

解决问题思路:

  • 确认需求数据在哪

    右键查看源代码

  • 使用 Fiddler 模拟发送数据

源码

  1. # -*- coding:utf-8 -*-
  2. import urllib2
  3. import urllib
  4. from lxml import etree
  5. import chardet
  6. import json
  7. import codecs
  8. def GetTimeByArticle(url):
  9. request = urllib2.Request(url)
  10. response = urllib2.urlopen(request)
  11. resHtml = response.read()
  12. html = etree.HTML(resHtml)
  13. time = html.xpath('//span[@class="tail-info"]')[1].text
  14. print time
  15. return time
  16. def main():
  17. output = codecs.open('tieba0812.json', 'w', encoding='utf-8')
  18. for pn in range(0, 250, 50):
  19. kw = u'网络爬虫'.encode('utf-8')
  20. url = 'http://tieba.baidu.com/f?kw=' + urllib.quote(kw) + '&ie=utf-8&pn=' + str(pn)
  21. print url
  22. request = urllib2.Request(url)
  23. response = urllib2.urlopen(request)
  24. resHtml = response.read()
  25. print resHtml
  26. html_dom = etree.HTML(resHtml)
  27. # print etree.tostring(html_dom)
  28. html = html_dom
  29. # site = html.xpath('//li[@data-field]')[0]
  30. for site in html.xpath('//li[@data-field]'):
  31. # print etree.tostring(site.xpath('.//a')[0])
  32. title = site.xpath('.//a')[0].text
  33. Article_url = site.xpath('.//a')[0].attrib['href']
  34. reply_date = GetTimeByArticle('http://tieba.baidu.com' + Article_url)
  35. jieshao = site.xpath('.//*[@class="threadlist_abs threadlist_abs_onlyline "]')[0].text.strip()
  36. author = site.xpath('.//*[@class="frs-author-name j_user_card "]')[0].text.strip()
  37. lastName = site.xpath('.//*[@class="frs-author-name j_user_card "]')[1].text.strip()
  38. print title, jieshao, Article_url, author, lastName
  39. item = {}
  40. item['title'] = title
  41. item['author'] = author
  42. item['lastName'] = lastName
  43. item['reply_date'] = reply_date
  44. print item
  45. line = json.dumps(item, ensure_ascii=False)
  46. print line
  47. print type(line)
  48. output.write(line + "\n")
  49. output.close()
  50. print 'end'
  # Start the crawl only when this file is executed as a script, so importing
  # the module does not trigger any network traffic.
  if __name__ == '__main__':
      main()