• TypeScript & Node.js crawler All In One


    TypeScript & Node.js crawler All In One

    
    "use strict";
    
    /**
     *
     * @author xgqfrms
     * @license MIT
     * @copyright xgqfrms
     * @created 2022-04-01
     * @modified
     *
     * @description TypeScript & Node.js crawler All In One
     * @augments
     * @example
     * @link https://www.cnblogs.com/xgqfrms/p/16086580.html
     *
     */
    
    import fs from "fs";
    import path from "path";
    
    import superagent from "superagent";
    import * as CheerioAPI from "cheerio";
    
    // Shorthand for console.log, used by the rest of this file.
    const log = console.log;
    
    
    // Both calls print the same `__dirname` value; only the label differs.
    // `__dirname` is a CommonJS module-scope variable and is not defined inside
    // ES modules, so these lines only work when the file runs as CommonJS.
    log('ESM ❌ __dirname = \n', __dirname);
    log('commonjs ✅ __dirname = \n', __dirname);
    
    /** One course entry scraped from a `.course-item` element. */
    type Course = {
      // img?: string;
      /** `src` attribute of the `.course-img` element; `parseHTML` stores '' when it is absent. */
      img: string;
      /** Text of the first `.course-desc` element. */
      title: string;
      /** Number parsed from the text of the second `.course-desc` element. */
      value: number;
    }
    /** Result of a single crawl: when it ran and what it found. */
    interface Courses {
      /** Value of `Date.now()` at parse time (milliseconds since the Unix epoch). */
      timestamp: number;
      // timestamp: Date;
      courses: Course[];
    }
    /** Shape of `courses.json`: each crawl timestamp maps to the courses captured by that crawl. */
    interface Content {
      [prop: number]: Course[];
    }
    /**
     * Downloads the course page, extracts every `.course-item` from it and appends
     * the result to `../data/courses.json` (resolved against this module's
     * directory), keyed by the crawl timestamp.
     *
     * Constructing an instance starts one crawl immediately.
     */
    class Crawler {
      private token: string = '';
      // NOTE(review): the host `cdn.xgqfrms` has no top-level domain — confirm the real URL.
      private url: string = 'https://cdn.xgqfrms/typescript/crawler/index.html';
      private HTMLStr: string = '';
      public loading: boolean = false;
      // public courses: Course[] = [];
      constructor() {
        // init() is async and a constructor cannot await it. Attach a handler here so
        // that a network or file-system failure is reported instead of surfacing as
        // an unhandled promise rejection.
        this.init().catch((err: unknown) => {
          log('❌ crawler failed =', err);
        });
      }
      /**
       * Runs one crawl: fetch the page, parse it, persist the result.
       *
       * @returns Promise that settles when the JSON file has been written;
       *          rejects if the request, the parsing or the file access fails.
       */
      async init(): Promise<void> {
        const html = await this.getHTMLStr();
        const courses = this.parseHTML(html);
        // log('courses =', courses);
        this.jsonGenerator(courses);
      }
      /**
       * Appends one crawl result to `../data/courses.json`, creating the folder
       * and the file on first use, then logs the file's new content.
       *
       * @param courses Crawl result; stored under the key `courses.timestamp`.
       * @throws If an existing `courses.json` does not contain valid JSON, or on
       *         any file-system error.
       */
      jsonGenerator(courses: Courses) {
        const folder = path.resolve(__dirname, '../data');
        // log('__dirname = \n', __dirname);
        log('folder = \n', folder)
        if (!fs.existsSync(folder)) {
          // Create exactly the folder that was checked above and that the file is
          // written into. The previous `fs.mkdirSync('./data')` was resolved against
          // the process working directory, which is a different location whenever
          // the program is not started from this module's parent directory.
          fs.mkdirSync(folder, { recursive: true });
          log('✅ create folder')
        } else {
          log('❌ create folder')
        }
        // Derived from `folder` so the file always lives in the directory ensured above.
        const filePath = path.join(folder, 'courses.json');
        let fileContent: Content = {};
        if (fs.existsSync(filePath)) {
          // Start from the previously stored crawls so that this run is appended.
          fileContent = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
        }
        // Append: one key per crawl timestamp.
        fileContent[courses.timestamp] = courses.courses;
        // Write the file. (Open question noted by the author: how Node.js could control
        // whether a browser downloads a file or opens a preview of it.)
        fs.writeFileSync(filePath, JSON.stringify(fileContent, null, 4));
        // Read the file back so the log shows what is actually stored on disk.
        const json = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
        log('json =', json);
      }
      /**
       * Extracts all courses from an HTML string.
       *
       * @param html Markup of the course page.
       * @returns The parsed courses together with the current timestamp.
       */
      parseHTML(html: string): Courses {
        // jQuery-style API over an HTML string.
        const $ = CheerioAPI.load(html);
        const courses: Course[] = [];
        // toArray() yields a plain array of elements, so the selection can be iterated
        // without casting it to `any` and without `--downlevelIteration` (TS2569).
        for (const item of $('.course-item').toArray()) {
          const $item = $(item);
          // attr() returns undefined when the attribute is missing; Course.img is a string.
          const img = $item.find('.course-img').attr('src') ?? '';
          const desc = $item.find('.course-desc');
          const title = desc.eq(0).text();
          // Strip the label and convert the remainder to a number; the result is NaN
          // when the remaining text is not numeric.
          const value = +desc.eq(1).text().replace('当前课程学习人数:', '');
          courses.push({
            img,
            title,
            value,
          });
        }
        return {
          timestamp: Date.now(),
          courses,
        };
      }
      /**
       * Fetches `this.url` and caches the response body in `this.HTMLStr`.
       *
       * @returns The response text, or '' when the response carries no text.
       */
      async getHTMLStr(): Promise<string> {
        const res = await superagent.get(this.url);
        this.HTMLStr = res.text ?? '';
        // Return the cached value so the caller and the field always agree; the raw
        // `res.text` may be undefined.
        return this.HTMLStr;
      }
    }
    
    export default Crawler;
    export {
      Crawler,
    };
    
    

    superagent

    $ npm i -D superagent
    
    
    # 类型注解
    $ npm i -D @types/superagent
    
    
    
    // Three ways to consume a superagent request: callback, promise chain, async/await.
    const superagent = require('superagent');
    
    // callback
    superagent
      .post('/api/pet')
      .send({ name: 'Manny', species: 'cat' }) // sends a JSON post body
      .set('X-API-Key', 'foobar')
      .set('accept', 'json')
      .end((err, res) => {
        // Calling the end function will send the request
      });
    
    // promise with then/catch
    superagent.post('/api/pet').then(console.log).catch(console.error);
    
    // promise with async/await
    // (wrapped in an immediately invoked async function so that `await` can be used)
    (async () => {
      try {
        const res = await superagent.post('/api/pet');
        console.log(res);
      } catch (err) {
        console.error(err);
      }
    })();
    

    https://www.npmjs.com/package/superagent

    https://github.com/visionmedia/superagent

    cheerio

    DOM string parser

    $ npm i -D cheerio
    
    
    const cheerio = require('cheerio');
    // Parse an HTML string; `$` is then queried with jQuery-style selectors.
    const $ = cheerio.load('<h2 class="title">Hello world</h2>');
    
    // Replace the heading's text, then add a class to it.
    $('h2.title').text('Hello there!');
    $('h2').addClass('welcome');
    
    // Serialize the whole document back to an HTML string.
    $.html();
    // <html><head></head><body><h2 class="title welcome">Hello there!</h2></body></html>
    
    

    https://www.npmjs.com/package/cheerio

    https://github.com/cheeriojs/cheerio

    https://cheerio.js.org/

    demo

    cheerio get image tag src

      /**
       * Demo: read the `src` attribute of every course image with cheerio.
       *
       * Collects `{ img, title, value }` for each `.course-item`, logs the list
       * and returns only the current timestamp.
       *
       * @param html Markup to parse.
       */
      parseHTML(html: string) {
        // jQuery-style API over an HTML string.
        const $ = CheerioAPI.load(html);
        // Typed as `any` because iterating `Cheerio<Element>` directly needs the
        // `--downlevelIteration` compiler option (ts(2569)).
        const nodes: any = $('.course-item');
        const data = Array.from(nodes, (node: any) => {
          const $node = $(node);
          // attr() gives undefined when the image has no `src`.
          const img = $node.find('.course-img').attr('src');
          const descriptions = $node.find('.course-desc');
          return {
            img,
            title: descriptions.eq(0).text(),
            value: descriptions.eq(1).text(),
          };
        });
        console.log('data =', data);
        return {
          timestamp: Date.now(),
        };
      }
    
    

    https://www.tabnine.com/code/javascript/functions/cheerio/src

    https://stackoverflow.com/questions/47542338/cheerio-get-image-src-with-no-class

    Node.js path.resolve

    path.resolve([...paths])

    // A relative segment is appended to the absolute base.
    path.resolve('/foo/bar', './baz');
    // Returns: '/foo/bar/baz'
    
    // A later absolute segment replaces what came before it; the trailing slash is dropped.
    path.resolve('/foo/bar', '/tmp/file/');
    // Returns: '/tmp/file'
    
    // No segment is absolute, so the current working directory is used as the base.
    path.resolve('wwwroot', 'static_files/png/', '../gif/image.gif');
    // If the current working directory is /home/myself/node,
    // this returns '/home/myself/node/wwwroot/static_files/gif/image.gif'
    
    

    https://nodejs.org/api/path.html#pathresolvepaths

    https://stackoverflow.com/questions/35048686/whats-the-difference-between-path-resolve-and-path-join

    pdf crawler / pdf 爬虫

    Node.js download pdf files / Node.js 下载 pdf 文件

    // esm / ts
    
    

    https://stackoverflow.com/questions/25945714/how-to-download-pdf-file-from-url-in-node-js

    const fs = require("fs");
    const path = require("path");
    const { exit } = require("process");
    const log = console.log;
    const request = require("request");
    // const request = require("request-promise-native");
    
    // Downloads are stored in a `pdf` folder that is a sibling of this script's directory.
    const folder = path.resolve(__dirname, '../pdf');
    
    // log('folder', folder);
    
    // `recursive: true` makes the call a no-op when the folder already exists, so no
    // separate existence check is needed (and there is no gap between check and create).
    fs.mkdirSync(folder, { recursive: true });
    
    
    async function downloadPDF(url, filename) {
      log(' pdf downloading ...');
      const pdfBuffer = await request.get({
        uri: url,
        encoding: null,
        // encoding: 'utf-8',
      });
      fs.writeFileSync(filename, pdfBuffer);
      log('✅ pdf finished!');
      // exit 0;
    }
    
    const url = 'https://cs193p.sites.stanford.edu/sites/g/files/sbiybj16636/files/media/file/l1.pdf';
    const filename = folder + '/cs193p-2021-l1.pdf';
    
    
    // log('filename =', filename);
    
    // downloadPDF returns a promise. Handle its rejection here so that a failed
    // download is reported and reflected in the exit code instead of surfacing as
    // an unhandled promise rejection.
    downloadPDF(url, filename).catch((err) => {
      console.error('❌ pdf download failed!', err);
      process.exitCode = 1;
    });
    
    
    

    https://github.com/request/request
    https://github.com/request/request#promises--asyncawait
    https://github.com/request/request-promise-native

    "use strict";
    
    /**
     *
     * @author xgqfrms
     * @license MIT
     * @copyright xgqfrms
     * @created 2022-04-01
     * @modified
     *
     * @description  Node.js pdf crawler
     * @augments
     * @example
     * @link
     *
     */
    
    // 0. CommonJS module: dependencies are loaded with `require`
    const fs = require("fs");
    const path = require("path");
    const { exit } = require("process");
    
    // const request = require("request");
    // 1. Use `request-promise-native` rather than plain `request`: its `request.get()`
    //    returns a promise, which `downloadPDF` awaits. (Author's note: with plain
    //    `request` the downloaded PDF could not be parsed.)
    //    NOTE(review): the original comment also called one of the two packages
    //    "too slow" without making clear which one — confirm before relying on it.
    const request = require("request-promise-native");
    
    const log = console.log;
    
    // 2. custom download folder: a `pdf` folder that is a sibling of this script's directory
    const folder = path.resolve(__dirname, '../pdf');
    // log('folder', folder);
    
    // 3. make sure the folder exists; `recursive: true` makes the call a no-op when
    //    it is already there, so no separate existence check is needed
    fs.mkdirSync(folder, { recursive: true });
    
    async function downloadPDF(url, filename) {
      log(' pdf downloading ...');
      const pdfBuffer = await request.get({
        uri: url,
        encoding: null,
        // encoding: 'utf-8',
      });
      // 4. write file to local file system
      fs.writeFileSync(filename, pdfBuffer);
      log('✅ pdf download finished!');
      // 5. exit the terminal after download finished
      exit(0);
    }
    
    const url = 'https://cs193p.sites.stanford.edu/sites/g/files/sbiybj16636/files/media/file/l1.pdf';
    const filename = folder + '/cs193p-2021-l1.pdf';
    // log('filename =', filename);
    
    // downloadPDF returns a promise that rejects when the request or the file write
    // fails. Handle the rejection here so the failure is reported and reflected in
    // the exit code instead of surfacing as an unhandled promise rejection.
    downloadPDF(url, filename).catch((err) => {
      console.error('❌ pdf download failed!', err);
      process.exitCode = 1;
    });
    
    
    

    https://nodejs.org/docs/v0.4.12/api/http.html#response.writeHead

    refs

    Stanford University Spring 2021 CS193p Course pdf

    https://www.youtube.com/watch?v=--qKOhdgJAs



    ©xgqfrms 2012-2020

    www.cnblogs.com/xgqfrms 发布文章使用:只允许注册用户才可以访问!

    原创文章,版权所有©️xgqfrms, 禁止转载 ️,侵权必究⚠️!


  • 相关阅读:
    jQuery中Ajax事件beforesend及各参数含义 转
    如何组织好js代码 转
    C# 几种HtmlEncode的区别
    [转]建议Font-Size使用em单位控制大小
    JavaScript charAt() 方法
    Js中 关于top、clientTop、scrollTop、offsetTop的用法
    jquery mobile 的4个初始化事件
    JS操作JSON总结
    .html(),.text()和.val()的使用
    netstat 显示当前网络连接的统计信息
  • 原文地址:https://www.cnblogs.com/xgqfrms/p/16086580.html
Copyright © 2020-2023  润新知