以https://fxhblog.top/为爬取对象,实现一个简单的网页抓取器

  • 在node环境下安装superagent和cheerio
cnpm install superagent --save
cnpm install cheerio --save
  • 构造HTTP请求(在node环境下,superagent发出的是普通的HTTP请求,而不是浏览器中的XHR)
// Request the target page; the callback receives the error (if any) and the response.
superAgent.get(aimUrl).end(function (err, res) {
    if (err) {
        console.log("访问出错");
        // Stop here: after a failed request there may be no usable response,
        // so falling through to res.text would throw or parse an error page.
        return;
    }
    // Parse the returned HTML so it can be queried with selectors.
    let $ = cheerio.load(res.text);
});
let $ = cheerio.load(res.text);// cheerio offers a jQuery-like way to query the loaded HTML. NOTE(review): `res` is only in scope inside the request callback above; this line appears to repeat that call for illustration.

  • 将获取到的response分析后,提取有用的信息,比如文章的URL等等
// Collect one entry (URL, title, index) per article link found on the listing page.
let allLink = $('.post-type-normal .post-block link');
allLink.each(function (index, current) {
    const href = $(current).attr("href");
    if (!href) {
        // No address to crawl for this element; skip it instead of crashing in getTitleByUrl.
        return;
    }
    const currentArticle = {};
    currentArticle.articleUrl = encodeURI(href);
    currentArticle.articleTitle = getTitleByUrl(href);
    currentArticle.No = index;
    articleList.push(currentArticle);
});
// Build the crawl promises once, after the list is complete. Doing this inside the
// loop re-requested every article collected so far on each iteration and left the
// result scoped to the callback, where nothing else could use it.
let promiseSpiders = constructPromiseSpider(articleList);
/**
 * Derive an article title from its URL by taking the last non-empty
 * "/"-separated segment.
 *
 * @param {string} _url - Article address, with or without a trailing slash.
 * @returns {string|undefined} The last non-empty segment, or undefined when
 *     the URL contains no non-empty segment.
 */
const getTitleByUrl = (_url) => {
    // Dropping empty segments makes ".../my-post/" and ".../my-post" yield the
    // same title; indexing from the end by a fixed offset only worked for the former.
    const segments = _url.split("/").filter((segment) => segment !== "");
    return segments[segments.length - 1];
};
  • 将每一个文章URL单独包裹在一个promise中用于爬取
/**
 * Wrap each article in its own promise that fetches the article page.
 *
 * @param {Array<{articleUrl: string, No: number}>} data - Articles to fetch;
 *     only `articleUrl` and `No` are read here.
 * @returns {Promise[]} One promise per entry. Each resolves with the response
 *     object, tagged with `No` and `Url` copied from the entry, or rejects
 *     with the request error.
 */
const constructPromiseSpider = (data) => {
    return data.map(function (current) {
        return new Promise(function (resolve, reject) {
            superAgent.get(current.articleUrl).end(function (err, res) {
                if (err) {
                    console.log("爬取文章详情出错");
                    // Settle the promise with the failure. Falling through would
                    // dereference a response that may not exist and leave the
                    // promise (and any Promise.all built on it) pending forever.
                    reject(err);
                    return;
                }
                // Tag the response so later processing can tell which article it belongs to.
                res.No = current.No;
                res.Url = current.articleUrl;
                resolve(res);
            });
        });
    });
};
  • 通过Promise.all([传入一个promise对象数组])等待所有页面爬取完成,然后对返回结果进行第二次数据分析,提取有价值的信息,比如文章发表时间,文章字数,文章分类等等
// Separator used to cut the flattened word-count text into fields (a fixed run of spaces).
// NOTE(review): presumably mirrors the indentation of the target page's markup — confirm against the live page.
const META_FIELD_SEPARATOR = "                  ";

/**
 * Extract the details of every fetched article page and append them to the
 * shared `articleDetailList`.
 *
 * @param {Array<{text: string, No: *, Url: *}>} resArray - Fetched responses;
 *     `text` is parsed as HTML, `No` and `Url` are copied into the result.
 * @returns {Array<Object>} `articleDetailList` (defined outside this function)
 *     after one entry per response has been pushed onto it.
 */
const statisticArticle = (resArray) => {
    resArray.forEach(function (currentRes) {
        const $ = cheerio.load(currentRes.text);
        // Query and flatten the word-count text once; both the word count and
        // the read count below are cut from the same string.
        const metaFields = $(".post-header .post-meta .post-wordcount .post-meta-item-text")
            .next()
            .text()
            .split("\n")
            .join("")
            .split(META_FIELD_SEPARATOR);
        const readCount = metaFields[2];
        const currentArticle = {
            "文章描述信息": {}
        };
        currentArticle["编号"] = currentRes.No;
        currentArticle["文章地址"] = currentRes.Url;
        currentArticle["标题"] = $(".post-header .post-title").text();
        currentArticle["文章描述信息"]["发表时间"] = $(".post-header .post-meta time").text().split("\n").join("").trim();
        currentArticle["文章描述信息"]["文章分类"] = $(".post-header .post-meta span a").attr("href");
        currentArticle["文章描述信息"]["文章字数"] = metaFields[1];
        // A page without the third field used to throw on .trim() and abort the
        // whole batch; leave the value undefined for that page instead.
        currentArticle["文章描述信息"]["阅读次数"] = readCount === undefined ? undefined : readCount.trim();
        currentArticle["文章内容"] = $(".post-body").text();
        articleDetailList.push(currentArticle);
    });
    return articleDetailList;
};
  • 将构造好的数据存入本地JSON文件中,完成网页爬取
/**
 * Index the article records by their number and write them to ./FxhBlog.json.
 *
 * @param {Array<Object>} data - Article records; each record's "编号" value is
 *     used to build its key in the output object.
 * @returns {void} The file is written asynchronously; the outcome is only logged.
 */
const createJSON = (data) => {
    const _data = {};
    data.forEach(function (current) {
        _data[`编号${current["编号"]}`] = current;
    });
    const dataToJson = JSON.stringify(_data);
    fs.writeFile("./FxhBlog.json", dataToJson, function (err) {
        if (err) {
            console.log("写入失败", err);
            // Do not fall through: reporting success after a failed write is misleading.
            return;
        }
        console.log("Success!");
    });
};

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注