• Puppeteer 前端截图 爬虫


    https://zhuanlan.zhihu.com/p/76237595

    https://juejin.cn/post/6882332163052994574

    const puppeteer = require('puppeteer');
    const fs = require('fs');
    const path = require('path');

    /**
     * Adds a `hashCode()` method to every string.
     *
     * Folds the UTF-16 code units of the string, left to right, into a signed
     * 32-bit integer using `hash = hash * 127 + codeUnit` (mod 2^32).
     *
     * @returns {number} A signed 32-bit integer; `0` for the empty string.
     */
    String.prototype.hashCode = function () {
      let result = 0;
      for (let index = 0; index < this.length; index += 1) {
        // `(result << 7) - result` is result * 127; `| 0` truncates the sum
        // back to a signed 32-bit integer after every step.
        result = (((result << 7) - result) + this.charCodeAt(index)) | 0;
      }
      return result;
    };

    /**
     * Loads `url` in a headless browser and saves every image response whose
     * URL matches `upload.../<name>.(jpg|png|svg)` into `images/<hash>/`, where
     * `<hash>` is `url.hashCode()`. The page URL itself is written to
     * `images/<hash>/url`.
     *
     * @param {string} url - Address of the page to open.
     * @returns {Promise<void>} Resolves once the page reached `networkidle2`
     *   and the browser has been closed.
     * @throws Rejects if the output directory cannot be created, or if launching
     *   the browser or navigating fails.
     */
    async function getWelfareImage (url) {
      console.log("start..." + url);
      const hash = url.hashCode();
      const outputDir = `images/${hash}`;

      // The directory must exist before anything is written into it, so wait
      // for the creation instead of firing a callback-style mkdir and moving on.
      await fs.promises.mkdir(outputDir, { recursive: true });
      try {
        await fs.promises.writeFile(`${outputDir}/url`, url);
      } catch (err) {
        // Best effort: the marker file is informational only.
        console.log(err);
      }

      const browser = await puppeteer.launch();
      try {
        const page = await browser.newPage();
        await page.setViewport({ width: 1280, height: 926 });

        page.on('response', async (response) => {
          const matches = /upload.*\/([^\/]*)\.(jpg|png|svg)$/.exec(response.url());
          if (!matches) {
            return;
          }
          const [, name, extension] = matches;
          try {
            const buffer = await response.buffer();
            fs.writeFileSync(`${outputDir}/image-${name}.${extension}`, buffer);
          } catch (err) {
            // A body that cannot be read or written must not become an
            // unhandled rejection inside this event handler.
            console.log(err);
          }
        });

        // timeout: 0 disables the navigation timeout.
        await page.goto(url, {
          waitUntil: 'networkidle2',
          timeout: 0
        });

        console.log("end..." + url);
      } finally {
        // Always release the browser process, even when navigation fails.
        await browser.close();
      }
    }


    /**
     * Opens the thread index page, collects every anchor whose `href` starts
     * with `thread-index-fid-`, and runs `getWelfareImage` on each of those
     * links one after another.
     *
     * @returns {Promise<void>} Resolves after every collected link has been
     *   processed and the browser has been closed.
     * @throws Rejects if the `images` directory cannot be created, or if the
     *   browser, the navigation, or any `getWelfareImage` call fails.
     */
    async function getAllThreadList () {
      console.log("start...");
      await fs.promises.mkdir("./images", { recursive: true });

      const browser = await puppeteer.launch();
      try {
        const page = await browser.newPage();

        // goto() with waitUntil: 'load' resolves once the page has loaded, so
        // the work runs inline here rather than in a 'load' event handler whose
        // errors nobody could observe.
        await page.goto('http://www.btbtt17.com/xxxx', {
          waitUntil: 'load',
          timeout: 0
        });
        console.log('Page loaded!');

        const urls = await page.evaluate(() => {
          // This callback runs inside the page, not in Node.
          console.log('Page evaluate!');
          const hrefs = [];
          for (const anchor of document.querySelectorAll('a')) {
            const href = anchor.getAttribute("href");
            // Anchors without an href attribute yield null; skip them.
            if (href !== null && href.startsWith('thread-index-fid-')) {
              hrefs.push(href);
            }
          }
          return hrefs;
        });
        console.log(urls);

        for (const threadPath of urls) {
          await getWelfareImage('http://www.btbtt17.com/' + threadPath);
        }
      } finally {
        // Always release the browser process, even when a step above fails.
        await browser.close();
      }
      console.log("end...");
    }

    // getAllThreadList()

    // Page URLs to scrape; empty by default, fill in before running the script.
    var urls = [
    ];
    for (const pageUrl of urls) {
      // All pages are started without waiting for each other. Report a failed
      // page instead of leaving its promise rejection unhandled.
      getWelfareImage(pageUrl).catch((err) => {
        console.log(err);
      });
    }
  • 相关阅读:
    org.dom4j.DocumentException: Error on line 1 of document: 前言中不允许有内容
    学习过程中的随手笔记
    IT技术团队行而有效的管理之道
    九宫格抽奖HTML+JS版
    Nginx负载均衡深入浅出
    PHP 二维数组根据某个字段排序
    MYSQL INSERT INTO SELECT 不插入重复数据
    小米2成功使用google组件的办法(为了coc游戏能登录google账户)
    PHP 数组排序方法总结
    普通标和转让标的回款和还款日期的算法。
  • 原文地址:https://www.cnblogs.com/xuxm2007/p/16731081.html
Copyright © 2020-2023  润新知