文件相关函数与工具函数
将回调风格的 API 封装成 Promise 风格。ensureSavaPath 里面其实没有必要用 try/catch,直接用 catch 回调反而更简洁。当然,如果不想自己封装这些函数,使用 fs-extra 里面的 API 同样可以快速地完成相同的功能。
/**
 * Save a file: wraps the callback-style jsonfile.writeFile in a Promise.
 *
 * @param {string} path - File path passed through to jsonfile.writeFile.
 * @param {*} data - Value passed through to jsonfile.writeFile.
 * @returns {Promise<void>} Resolves with no value when the callback reports
 *   no error; rejects with the error the callback receives.
 */
const saveFile = (path, data) => {
  return new Promise((resolve, reject) => {
    jsonfile.writeFile(path, data, e => {
      // Settle exactly once: stop on failure instead of falling through
      // to resolve().
      if (e) {
        reject(e)
        return
      }
      resolve()
    })
  })
}
/**
 * Wait: returns a promise that is resolved by a setTimeout timer.
 *
 * @param {number} time - Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) when the timer fires.
 */
const sleep = time => {
  // Executor: arm the timer and let it call the resolver.
  const startTimer = done => {
    setTimeout(done, time)
  }
  return new Promise(startTimer)
}
/**
 * Check whether a path exists by calling fs.stat on it.
 *
 * @param {string} path - Path passed to fs.stat.
 * @returns {Promise<fs.Stats>} Resolves with the stats object; rejects with
 *   the error fs.stat reports (for example when the path does not exist).
 */
const exits = path => {
  return new Promise((resolve, reject) => {
    fs.stat(path, (err, stats) => {
      // Settle exactly once: stop on failure instead of falling through
      // to resolve(stats).
      if (err) {
        reject(err)
        return
      }
      resolve(stats)
    })
  })
}
/**
 * Make sure `path` exists: check it with exits() and, when that check
 * rejects for any reason, fall back to mkdir(path).
 *
 * @param {string} path - Path passed to exits() and, on failure, to mkdir().
 * @returns {Promise<*>} Settles with the value from exits(path), or with
 *   whatever mkdir(path) yields when exits(path) rejects.
 */
const ensureSavaPath = path => {
  const createOnFailure = () => mkdir(path)
  return exits(path).catch(createOnFailure)
}
那么可不可以再优化一下呢?写成一行时要注意:因为使用了逗号运算符,如果不加括号,逗号会被当作函数参数的分隔符,导致 yes 不会在回调里被调用(而是作为参数立即求值),所以要用括号把整个表达式括起来。最外层的函数其实也可以写成一行,不过外层真写成一行,那就过分了。
/**
 * Create a pending promise and expose its settle functions.
 *
 * @returns {Array} A three-element array: [resolve, reject, promise], where
 *   resolve/reject are the functions handed to the Promise executor.
 */
function ready() {
  const settle = {}
  const promise = new Promise((resolve, reject) => {
    // The executor runs synchronously, so both functions are captured
    // before the array below is built.
    settle.resolve = resolve
    settle.reject = reject
  })
  return [settle.resolve, settle.reject, promise]
}
// Save a file: calls jsonfile.writeFile and returns the promise obtained
// from ready(), which the callback settles.
const saveFile = (path, data) => {
// yes/no are the resolve/reject functions of `wait` (see ready()).
const [yes, no, wait] = ready()
// Parenthesized comma expression as the arrow body: runs no(e) when e is
// truthy, then yes(). After no(e) the promise is already rejected, so the
// trailing yes() does not change its state.
jsonfile.writeFile(path, data, e => (e && no(e), yes()))
return wait
}
/**
 * Wait: returns the promise from ready(), resolved by a setTimeout timer.
 *
 * @param {number} time - Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) when the timer fires.
 */
const sleep = time => {
  // Only the resolver and the promise are needed; a timer cannot fail, so
  // the reject function (second element) is skipped.
  const [yes, , wait] = ready()
  setTimeout(yes, time)
  return wait
}
// Check whether a path exists by calling fs.stat on it; returns the promise
// obtained from ready(), which the callback settles.
const exits = path => {
// yes/no are the resolve/reject functions of `wait` (see ready()).
const [yes, no, wait] = ready()
// Parenthesized comma expression as the arrow body: runs no(e) when e is
// truthy, then yes(stats). After no(e) the promise is already rejected, so
// the trailing yes(stats) does not change its state.
fs.stat(path, (e, stats) => (e && no(e), yes(stats)))
return wait
}
/**
 * Make sure `path` exists: check it with exits() and, when that check
 * rejects for any reason, fall back to mkdir(path).
 *
 * @param {string} path - Path passed to exits() and, on failure, to mkdir().
 * @returns {Promise<*>} Settles with the value from exits(path), or with
 *   whatever mkdir(path) yields when exits(path) rejects.
 */
const ensureSavaPath = path => {
  const createOnFailure = () => mkdir(path)
  return exits(path).catch(createOnFailure)
}
对于 callback 的处理,有一部分共用了,同样可以提取出来。
/**
 * Build a Node-style (error-first) callback from a success/failure pair.
 *
 * @param {Function} yes - Called with the callback's result arguments when
 *   no error is reported.
 * @param {Function} no - Called with the error when one is reported.
 * @returns {Function} Callback `(e, ...args)` that invokes exactly one of
 *   `no(e)` or `yes(...args)`, never both.
 */
const callbackHandler = (yes, no) => (e, ...args) => (e ? no(e) : yes(...args))
// Usage: pass the shared handler wherever an error-first callback is expected.
// NOTE(review): path, data, yes and no are presumably the locals of the
// wrapper functions shown earlier — these two lines are fragments.
jsonfile.writeFile(path, data, callbackHandler(yes, no))
fs.stat(path, callbackHandler(yes, no))
那么继续抽象的话,其实对于这种 Node 统一风格(错误优先)的回调,转换成 Promise 非常容易,使用 promisify 即可,类似于 RxJS 的 bindNodeCallback。
const { promisify } = require('util')
/**
 * Create a pending promise and expose its settle functions.
 *
 * @returns {Array} A three-element array: [resolve, reject, promise], where
 *   resolve/reject are the functions handed to the Promise executor.
 */
function ready() {
  let fulfill
  let fail
  // The executor runs synchronously, so both variables are assigned before
  // the return statement reads them.
  const pending = new Promise((resolve, reject) => {
    fulfill = resolve
    fail = reject
  })
  return [fulfill, fail, pending]
}
const sleep = time => {
const [yes, no, wait] = ready()
setTimeout(yes, time)
return wait
}
// Save a file: promisified jsonfile.writeFile, so it returns a promise
// instead of taking an error-first callback.
const saveFile = promisify(jsonfile.writeFile)
// Check whether a path exists: promisified fs.stat; the promise rejects
// when fs.stat reports an error.
const exits = promisify(fs.stat)
/**
 * Make sure `path` exists: check it with exits() and, when that check
 * rejects for any reason, fall back to mkdir(path).
 *
 * @param {string} path - Path passed to exits() and, on failure, to mkdir().
 * @returns {Promise<*>} Settles with the value from exits(path), or with
 *   whatever mkdir(path) yields when exits(path) rejects.
 */
const ensureSavaPath = path => {
  const createOnFailure = () => mkdir(path)
  return exits(path).catch(createOnFailure)
}
核心逻辑
下载章节
了解了基础知识后,我们进入正题。crawl 是爬取规则,不把规则写在函数内部是为了解耦,毕竟以后扩展时不能只对一个网站有效。这里把 url 传递进去是为了方便拼接:有的网站用的是相对路径,需要统一处理成绝对路径。
/**
 * Download the chapter list of a page and save it as chapters.json.
 *
 * @param {string} url - Page address; passed to buildSelector and on to
 *   crawl.chapter.
 * @param {object} crawl - Crawl rules; crawl.chapter(selector, url) produces
 *   the data that gets saved.
 * @param {object} opts - Options; this function reads opts.path (output
 *   directory) and opts.charset (passed to buildSelector).
 * @returns {Promise<void>} Settles once saveFile has finished.
 */
async function downloadChapter(url, crawl, opts) {
  const outputDir = opts.path
  const encoding = opts.charset
  const selector = await buildSelector(url, encoding)
  const chapters = crawl.chapter(selector, url)
  const target = resolve(outputDir, 'chapters.json')
  await saveFile(target, chapters)
}
下载内容
下载内容的时候,同样使用 crawl 提供的爬取内容的规则。
/**
 * Download the content of one chapter and save it as a JSON file named
 * `text/<index>-<title>.json` under opts.path.
 *
 * @param {object} chapter - Chapter entry; chapter.url is fetched and
 *   chapter.title is used in the file name.
 * @param {object} crawl - Crawl rules; crawl.text(selector) produces the
 *   data that gets saved.
 * @param {number|string} index - Prefix used in the file name.
 * @param {object} opts - Options; this function reads opts.path (output
 *   directory) and opts.charset (passed to buildSelector).
 * @returns {Promise<void>} Settles once saveFile has finished.
 */
async function downloadText(chapter, crawl, index, opts) {
  const { path, charset } = opts
  const selector = await buildSelector(chapter.url, charset)
  const text = crawl.text(selector)
  // The title is crawled from a web page: replace path separators and other
  // characters that are invalid in file names, so the title can neither
  // escape the text/ directory nor make the write fail.
  const safeTitle = String(chapter.title).replace(/[\\\/:*?"<>|\x00-\x1f]/g, '_')
  await saveFile(resolve(path, `text/${index}-${safeTitle}.json`), text)
}