nodejs 读取博客园自己博客列表生成全部标题列表

需求

自己的博文有823篇了，但是不能在一页里面显示，本来想通过 MetaWeblog 读取，后来发现失败了。
没办法，自己写个脚本读取吧。

之前MetaWeblog 读取失败的文章：https://www.cnblogs.com/pengchenggang/p/16593890.html

原理

装一个axios，get读取一个页面，这样就获得了html，再通过正则提取出标题和链接，存在数组里面。

经过递归，刷一遍列表分页。为了保证顺序，所以要第一页读取完成后，再进行第二页的加载。

上截图

上代码

// cnblogs-node.js
const axios = require("axios")
const fs = require('fs')
// Number of list pages to fetch (hard-coded).
const pageTotal = 22
// Accumulates the { title, link } entries of every page, in page order.
const dataArr = []

// Start at page 1; each page requests the next one until pageTotal is reached.
getPageContent(1)

/**
 * Fetches one page of the blog's post list, extracts every post title/link
 * found on it and appends them to `dataArr`. Afterwards it either requests the
 * next page or, once `pageTotal` is reached, writes the collected entries to
 * data.json (JSON) and html.html (one anchor per line).
 *
 * Pages are requested strictly one after another so that `dataArr` keeps the
 * order of the list pages.
 *
 * @param {number} page 1-based page number to fetch.
 */
function getPageContent (page) {
  axios.get('https://www.cnblogs.com/pengchenggang/default.html?page='+page).then(res => {
    const content = res.data
    const mainStartPosition = '<!--end: header 头部 -->'
    const mainEndPosition = '<!--end: main -->'
    const startIdx = content.indexOf(mainStartPosition)
    const endIdx = content.indexOf(mainEndPosition)
    // When a marker is missing, fall back to the document edge instead of
    // slicing from an offset derived from indexOf's -1.
    const pos1 = startIdx === -1 ? 0 : startIdx + mainStartPosition.length
    const pos2 = endIdx === -1 ? content.length : endIdx + mainEndPosition.length
    console.info('pos1', pos1)
    console.info('pos2', pos2)

    const listContent = content.substring(pos1, pos2)

    const reg = /<a class="postTitle2[\s\S]*?<\/a>/g

    // Collect the matches directly. Probing with reg.test() first would advance
    // reg.lastIndex past the first match (global regexes are stateful), so the
    // first post of every page would be missing from the exec() scan.
    reg.lastIndex = 0
    const ret = getRegExec(reg, listContent)
    if (ret.length === 0) {
      console.info('没有匹配上')
      return
    }

    console.info('匹配上拉')
    const ret1 = formatRet(ret) // convert raw anchor strings to { title, link }
    console.log("第"+ page + "页 读取完成")
    dataArr.push(...ret1)

    // `>=` rather than `===` so a start page beyond pageTotal cannot recurse forever.
    if (page < pageTotal) {
      getPageContent(page + 1)
      return
    }

    fs.writeFile('data.json', JSON.stringify(dataArr), function (err) {
      if (err) {
        console.error('data.json 写入失败', err)
        return
      }
      console.info('data.json 写入完成！')
    })
    const renderHtml = getRenderHtml(dataArr)
    fs.writeFile('html.html', renderHtml, function (err) {
      if (err) {
        console.error('html.html 写入失败', err)
        return
      }
      console.info('html.html 写入完成！')
    })
  }).catch(err => {
    // Network/HTTP failures and parsing errors would otherwise be unhandled rejections.
    console.error('第' + page + '页 读取失败', err)
  })
}

/**
 * Renders the collected posts as anchor tags, one per line.
 *
 * @param {{title: string, link: string}[]} arr Posts to render.
 * @returns {string} The anchors joined with newlines ('' for an empty list).
 */
function getRenderHtml(arr) {
  const anchors = []
  for (const post of arr) {
    anchors.push(`<a href="${post.link}" target="_blank">${post.title}</a>`)
  }
  return anchors.join('\n')
}
/**
 * Converts a list of raw anchor-tag strings into { title, link } objects by
 * running each one through getLinkAndTitle.
 *
 * @param {string[]} arr Raw `<a ...>...</a>` strings.
 * @returns {{title: string, link: string}[]} One entry per input string, same order.
 */
function formatRet (arr) {
  return arr.map(anchorHtml => getLinkAndTitle(anchorHtml))
}

/**
 * Extracts the link and the title from one post anchor of the list page.
 *
 * Expected input shape (whitespace/newlines around the title are typical):
 *   <a class="postTitle2 vertical-middle" href="https://.../p/16601242.html">
 *       <span>
 *           Some post title
 *       </span>
 *   </a>
 *
 * @param {string} str Raw anchor HTML.
 * @returns {{title: string, link: string}} The href value and the trimmed span text.
 * @throws {TypeError} When `str` contains no href followed by a <span> title.
 */
function getLinkAndTitle (str) {
  // [^"]* keeps the capture inside the attribute value even when further
  // attributes follow href; no /g flag, so the regex carries no lastIndex state.
  const m2 = /href="([^"]*)"[^>]*>[\s\S]*?<span>([\s\S]*?)<\/span>/.exec(str)
  if (!m2) {
    throw new TypeError('getLinkAndTitle: no href/<span> title found in: ' + str)
  }
  const link = m2[1]
  // Strip real line breaks. The previous `.replace(/\\n/, 'g')` instead replaced
  // a literal backslash-n sequence in the title with the letter "g".
  const title = m2[2].replace(/\r?\n/g, '').trim()
  return {
    title,
    link
  }
}

/**
 * Returns the full text of every match of `reg` in `txt`, in order.
 *
 * The scan always starts at the beginning of `txt`, regardless of any
 * `lastIndex` left on the regex by an earlier test()/exec() call.
 *
 * @param {RegExp} reg Pattern to apply; normally carries the `g` flag.
 * @param {string} txt Text to search.
 * @returns {string[]} Matched substrings (empty array when nothing matches).
 */
function getRegExec (reg, txt) {
  // Without g/y, exec() ignores lastIndex and would return the same first match
  // forever, so there is at most one result to report.
  if (!reg.global && !reg.sticky) {
    const only = reg.exec(txt)
    return only ? [only[0]] : []
  }

  const ret = []
  reg.lastIndex = 0
  let m
  while ((m = reg.exec(txt)) !== null) {
    ret.push(m[0])
    // A zero-length match does not advance lastIndex; step manually to avoid
    // looping on the same position.
    if (m[0] === '') {
      reg.lastIndex++
    }
  }
  return ret
}

package.json

{
  "name": "cnblogs-nodejs-script",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "run": "node cnblogs-node.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "axios": "^0.27.2"
  }
}

all cnblogsByPengchenggang 样式显示模板

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>all cnblogsByPengchenggang</title>
  <style>
    a {
      font-size: 14px;
      /* display: block; */
      /* margin: 5px 20px; */
      color: #666666;
    }

    a:hover {
      color: #333333;
      font-size: 14px;
      background-color: #f2f2f2;
    }

    .list {
      counter-reset: main;
      /* width: 800px; */
      margin: 0 auto;
    }

    a::before {
      counter-increment: main;
      content: counter(main);
      padding-right: 10px;
    }
  </style>
</head>

<body class="">
  <div class="list">
    $TemplateListSlot
  </div>
</body>

</html>

复盘

这里主要有3个知识点

读取数据用axios
解析数据用正则
存储数据用nodejs

相关阅读:
django模型中的抽象类（abstract）
http，tcp，udp的报文格式
关于HTTP请求GET和POST的区别
SQL语言分为四类，每类分别是？各包括什么？
Python中为什么可以通过bin(n & 0xffffffff)来获得负数的补码？
python中sorted和sorted 、reversed和reverse的使用。
Django Cannot assign "A1": "B1" must be a "C1" instance. 错误信息
python反转链表和成对反转
Python单例模式的四种方法
python的列表list和集合set操作

原文地址：https://www.cnblogs.com/pengchenggang/p/16611780.html

nodejs 读取博客园自己博客列表 生成全部标题列表

需求

原理

上截图

上代码

复盘

nodejs 读取博客园自己博客列表生成全部标题列表