文件相关函数与工具函数

将回调风格的函数封装成 Promise 风格。ensureSavaPath 里面其实没有必要用 try/catch,直接用 catch 回调反而更简洁。当然,如果不想自己封装这些函数,使用 fs-extra 里面的 API 同样可以快速地完成相同的功能。

  /**
   * Save a file: hand `data` to `jsonfile.writeFile` and expose the outcome
   * as a Promise.
   *
   * @param {string} path - Destination file path.
   * @param {*} data - Value passed to `jsonfile.writeFile` for writing.
   * @returns {Promise<void>} Resolves with no value when the callback reports
   *   success; rejects with the error the callback received.
   */
  const saveFile = (path, data) =>
    new Promise((resolve, reject) => {
      jsonfile.writeFile(path, data, err => {
        // Settle exactly once: never fall through to resolve() after a failure.
        if (err) {
          reject(err)
          return
        }
        resolve()
      })
    })
  /**
   * Wait: pause for `time` milliseconds.
   *
   * @param {number} time - Delay handed to `setTimeout`.
   * @returns {Promise<void>} Resolves (with no value) when the timer fires.
   */
  const sleep = time => {
    // The executor only needs `resolve`; a timer has no failure path here.
    const startTimer = done => {
      setTimeout(done, time)
    }
    return new Promise(startTimer)
  }
  /**
   * Check whether a file exists by calling `fs.stat` on it.
   *
   * @param {string} path - Path to inspect.
   * @returns {Promise<fs.Stats>} Resolves with the stats object reported by
   *   `fs.stat`; rejects with the error `fs.stat` passes to its callback.
   */
  const exits = path =>
    new Promise((resolve, reject) => {
      fs.stat(path, (err, stats) => {
        // Settle exactly once: do not call resolve() after a rejection.
        if (err) {
          reject(err)
          return
        }
        resolve(stats)
      })
    })
  /**
   * Make sure the save path exists, creating it with `mkdir` when it is missing.
   *
   * NOTE(review): `mkdir` is defined outside this listing; presumably a
   * Promise-returning wrapper around `fs.mkdir` - confirm.
   *
   * @param {string} path - Path that must exist.
   * @returns {Promise<*>} The result of `exits(path)` when the path is already
   *   there, otherwise whatever `mkdir(path)` returns.
   */
  const ensureSavaPath = path =>
    exits(path).catch(err => {
      // Only a missing path should trigger creation; any other coded stat
      // failure (permissions, I/O, ...) is surfaced instead of being masked.
      if (err && err.code && err.code !== 'ENOENT') {
        throw err
      }
      return mkdir(path)
    })

那么可不可以再优化一下呢?回调写成一行时要注意:因为使用了逗号运算符,为了避免逗号被当成函数参数的分隔符、导致 yes 不被调用,需要用一对括号把表达式括起来。最外层的函数也可以写成一行,不过真把外层也写成一行,那就过分了。

  /**
   * Create a pending Promise together with the functions that settle it.
   *
   * @returns {Array} `[resolve, reject, promise]` - the two settle functions
   *   captured from the Promise executor, followed by the Promise itself.
   */
  function ready() {
    const settle = {}
    // The executor runs synchronously, so both handles are set before return.
    const promise = new Promise((resolve, reject) => {
      settle.resolve = resolve
      settle.reject = reject
    })
    return [settle.resolve, settle.reject, promise]
  }
  /**
   * Save a file: hand `data` to `jsonfile.writeFile` and return the Promise
   * produced by `ready()`.
   *
   * @param {string} path - Destination file path.
   * @param {*} data - Value passed to `jsonfile.writeFile` for writing.
   * @returns {Promise<void>} The third element returned by `ready()`; settled
   *   through `yes()` on success or `no(e)` on failure.
   */
  const saveFile = (path, data) => {
    const [yes, no, wait] = ready()
    jsonfile.writeFile(path, data, e => {
      // Explicit branch instead of the comma operator, so yes() is not
      // invoked after the failure has already been reported.
      if (e) {
        no(e)
        return
      }
      yes()
    })
    return wait
  }
  /**
   * Pause for `time` milliseconds.
   *
   * @param {number} time - Delay handed to `setTimeout`.
   * @returns {Promise<void>} The Promise from `ready()`, resolved when the
   *   timer fires.
   */
  const sleep = time => {
    // The reject handle is skipped: nothing in this function can fail.
    const [yes, , wait] = ready()
    setTimeout(yes, time)
    return wait
  }
  /**
   * Check whether a file exists by calling `fs.stat` on it.
   *
   * @param {string} path - Path to inspect.
   * @returns {Promise<fs.Stats>} The Promise from `ready()`; resolved with the
   *   stats object, or rejected with the error `fs.stat` reports.
   */
  const exits = path => {
    const [yes, no, wait] = ready()
    fs.stat(path, (e, stats) => {
      // Explicit branch instead of the comma operator, so yes() is not
      // invoked after the failure has already been reported.
      if (e) {
        no(e)
        return
      }
      yes(stats)
    })
    return wait
  }
  /**
   * Make sure the save path exists, creating it with `mkdir` when it is missing.
   *
   * NOTE(review): `mkdir` is defined outside this listing; presumably a
   * Promise-returning wrapper around `fs.mkdir` - confirm.
   *
   * @param {string} path - Path that must exist.
   * @returns {Promise<*>} The result of `exits(path)` when the path is already
   *   there, otherwise whatever `mkdir(path)` returns.
   */
  const ensureSavaPath = path =>
    exits(path).catch(err => {
      // Only a missing path should trigger creation; any other coded stat
      // failure (permissions, I/O, ...) is surfaced instead of being masked.
      if (err && err.code && err.code !== 'ENOENT') {
        throw err
      }
      return mkdir(path)
    })

对于 callback 的处理,有一部分共用了,同样可以提取出来。

  /**
   * Build a Node-style `(err, ...results)` callback from a pair of settle
   * functions.
   *
   * @param {Function} yes - Called with the callback's result arguments when
   *   no error was passed.
   * @param {Function} no - Called with the error when one was passed.
   * @returns {Function} Callback that routes to exactly one of `no` / `yes`.
   */
  const callbackHandler = (yes, no) => (e, ...args) => {
    // Explicit branch instead of the comma operator, so yes() is not invoked
    // after the failure has already been reported.
    if (e) {
      no(e)
      return undefined
    }
    return yes(...args)
  }
  2. jsonfile.writeFile(path, data, callbackHandler(yes, no)) // generated handler passed as the write callback; NOTE(review): path, data, yes, no come from an enclosing function not shown here
  3. fs.stat(path, callbackHandler(yes, no)) // the same handler factory reused as the fs.stat callback

那么继续抽象的话,其实对于这种 Node 统一风格的回调函数,非常容易转换成 Promise,使用 promisify 即可,类似于 RxJS 的 bindNodeCallback。

  1. const { promisify } = require('util')
  /**
   * Create a pending Promise together with the functions that settle it.
   *
   * @returns {Array} `[resolve, reject, promise]` - the two settle functions
   *   captured from the Promise executor, followed by the Promise itself.
   */
  function ready() {
    const settle = {}
    // The executor runs synchronously, so both handles are set before return.
    const promise = new Promise((resolve, reject) => {
      settle.resolve = resolve
      settle.reject = reject
    })
    return [settle.resolve, settle.reject, promise]
  }
  /**
   * Pause for `time` milliseconds.
   *
   * @param {number} time - Delay handed to `setTimeout`.
   * @returns {Promise<void>} The Promise from `ready()`, resolved when the
   *   timer fires.
   */
  const sleep = time => {
    // The reject handle is skipped: nothing in this function can fail.
    const [yes, , wait] = ready()
    setTimeout(yes, time)
    return wait
  }
  14. // Save a file: Promise-returning form of jsonfile.writeFile, built with promisify
  15. const saveFile = promisify(jsonfile.writeFile)
  16. const exits = promisify(fs.stat) // Promise-returning form of fs.stat
  /**
   * Make sure the save path exists, creating it with `mkdir` when it is missing.
   *
   * NOTE(review): `mkdir` is defined outside this listing; presumably a
   * Promise-returning wrapper around `fs.mkdir` - confirm.
   *
   * @param {string} path - Path that must exist.
   * @returns {Promise<*>} The result of `exits(path)` when the path is already
   *   there, otherwise whatever `mkdir(path)` returns.
   */
  const ensureSavaPath = path =>
    exits(path).catch(err => {
      // Only a missing path should trigger creation; any other coded stat
      // failure (permissions, I/O, ...) is surfaced instead of being masked.
      if (err && err.code && err.code !== 'ENOENT') {
        throw err
      }
      return mkdir(path)
    })

核心逻辑

下载章节

了解了基础知识后,我们进入正题。crawl 是爬取规则,不把规则写在函数内部是为了解耦,毕竟以后扩展时,不能只对一个网站有效。这里把 url 传递进去是为了方便拼接:有的网站用的是相对路径,需要统一处理成绝对路径。

  /**
   * Download the chapter list: build a selector for `url`, run the crawl
   * rule's `chapter` extractor on it and save the result as `chapters.json`.
   *
   * NOTE(review): `buildSelector`, `resolve` and `saveFile` are defined outside
   * this listing; `resolve` is presumably `path.resolve` - confirm.
   *
   * @param {string} url - Page address; also forwarded to `crawl.chapter`.
   * @param {Object} crawl - Crawl rules; `crawl.chapter(selector, url)` is used.
   * @param {Object} opts - Options; `opts.path` (output directory) and
   *   `opts.charset` (passed to `buildSelector`) are read.
   * @returns {Promise<void>} Settles once `saveFile` has finished.
   */
  async function downloadChapter(url, crawl, opts) {
    const { path: outputDir, charset: encoding } = opts
    const pageSelector = await buildSelector(url, encoding)
    const chapters = crawl.chapter(pageSelector, url)
    const target = resolve(outputDir, 'chapters.json')
    await saveFile(target, chapters)
  }

下载内容

下载内容的时候,同样使用 crawl 提供的爬取内容的规则。

  /**
   * Download one chapter's text: build a selector for `chapter.url`, run the
   * crawl rule's `text` extractor on it and save the result under `text/`.
   *
   * NOTE(review): `buildSelector`, `resolve` and `saveFile` are defined outside
   * this listing; `resolve` is presumably `path.resolve` - confirm.
   *
   * @param {Object} chapter - Chapter record; `chapter.url` and `chapter.title`
   *   are read.
   * @param {Object} crawl - Crawl rules; `crawl.text(selector)` is used.
   * @param {number|string} index - Prefix of the output file name.
   * @param {Object} opts - Options; `opts.path` (output directory) and
   *   `opts.charset` (passed to `buildSelector`) are read.
   * @returns {Promise<void>} Settles once `saveFile` has finished.
   */
  async function downloadText(chapter, crawl, index, opts) {
    const { path, charset } = opts
    const selector = await buildSelector(chapter.url, charset)
    const text = crawl.text(selector)
    // Titles are scraped from the page and may contain path separators or
    // characters that are invalid in file names; replace them so the file
    // always lands directly inside `text/`.
    const safeTitle = String(chapter.title).replace(/[\\/:*?"<>|]/g, '_')
    await saveFile(resolve(path, `text/${index}-${safeTitle}.json`), text)
  }