• nodejs实现抓取图片的爬虫脚本--crawler.js


    仅做了必要的注释,我太懒了。目前只是一个雏形,实现基本的需求。有时间会修改得完善一些。

    /*
     * @Author: jiahaiLiu
     * @Date:   2017-07-17 10:44:03
     * @Last Modified by:   jiahaiLiu
     * @Last Modified time: 2017-07-17 18:53:48
     * @Usage: node crawler [100]
     */
    
    'use strict';
    
    /*
     * Request is designed to be the simplest way possible to make http calls. 
     * It supports HTTPS and follows redirects by default.
     */
    const request = require('request');
    // cheerio: a fast, flexible, and lean implementation of core jQuery designed specifically for the server.
    const cheerio = require('cheerio');
    /*
     * Async is a utility module which provides straight-forward,
     * powerful functions for working with asynchronous JavaScript.
     */
    const async = require('async');
    const path = require('path');
    const fs = require('fs');
    const url = require('url');
    // Fallback for the number of image links to collect when no usable CLI
    // argument is given.
    const DEFAULT_TARGET_AMOUNT = 100;

    /*
     * Convert the raw CLI argument into a positive integer.
     *
     * @param {string|undefined} raw - The first user-supplied CLI argument, if any.
     * @returns {number} The parsed amount, or DEFAULT_TARGET_AMOUNT when `raw` is
     *     missing, non-numeric, zero or negative.
     */
    function parseTargetAmount(raw) {
        const parsed = Number.parseInt(raw, 10);
        return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_TARGET_AMOUNT;
    }

    // Target number of image links to collect (usage: node crawler [100]).
    // Read without mutating process.argv and stored as a real number, so the
    // later `imgList.length >= targetAmount` comparison cannot silently fail on
    // a non-numeric argument.
    let targetAmount = parseTargetAmount(process.argv[2]);
    /* Sample of a complete request URL:
       http://image.so.com/j?q=%E7%BE%8E%E5%A5%B3&src=srp&correct=%E7%BE%8E%E5%A5%B3&sn=61&pn=60&sid=7e73fad3c0eb8367ede610dcf2784c0e&ran=0&ras=0 */
    // Name of the folder the downloaded pictures are written to.
    let collect_pic_dir = './collect_pic/';

    let imgList = []; // collected image URLs
    // `start` is the running index used to name downloaded files (0.jpg, 1.jpg, ...).
    // NOTE(review): the module-level `dest` is not referenced by the visible code
    // (downloadPic uses its own `dest` parameter); kept for compatibility.
    let dest,
        start = 0;
    // Script start time in milliseconds; used to report the total elapsed time.
    let t1 = new Date().getTime();
    // URL components handed to url.format() to build each search request.
    // `sn` and `sid` are overwritten from every response to request the next page;
    // `pn` is presumably the page size -- TODO confirm against the API.
    let urlObj = {
        protocol: 'http:',
        slashes: true,
        auth: null,
        host: 'image.so.com',
        port: null,
        hostname: 'image.so.com',
        hash: null,
        query: {
            q: '美女',
            src: 'srp',
            correct: '美女',
            sn: '0',
            pn: '60',
            sid: '7e73fad3c0eb8367ede610dcf2784c0e',
            ran: '0',
            ras: '0'
        },
        pathname: '/j',
    };
    // Most recently requested URL and most recently parsed response.
    let urlLink,
        resObj;
    
    
    
    // Make sure the output folder exists before anything is written into it.
    const picDirExists = fs.existsSync(collect_pic_dir);
    if (picDirExists === false) {
        fs.mkdirSync(collect_pic_dir);
        console.log('The ' + collect_pic_dir + ' folder has been created!');
    }

    // Start collecting image links; asyncDownload is handed over as the
    // callback to run once collection is complete.
    loop(asyncDownload);
    
    /*
     * Fetch one page of search results, collect the image URLs it contains and
     * keep requesting further pages until enough links have been gathered.
     *
     * Reads and updates module-level state: `urlObj` (paging cursor `sn` and
     * session id `sid`), `urlLink`, `resObj` and `imgList`.
     *
     * @param {Function} cb - Called with no arguments once `imgList.length`
     *     reaches `targetAmount`. It is NOT called when a request fails, the
     *     response is unusable, or the source runs out of results first; each of
     *     those cases is reported on the console instead of failing silently.
     */
    function loop(cb) {
        urlLink = url.format(urlObj);
        console.log(urlLink);
        request(urlLink, function(err, res, body) {
            if (err) {
                console.error('request failed for ' + urlLink + ':', err);
                return;
            }
            if (res.statusCode !== 200) {
                console.error('unexpected HTTP status ' + res.statusCode + ' for ' + urlLink);
                return;
            }

            // The endpoint answers with JSON. (An earlier approach, since
            // abandoned, scraped the JSON embedded in the HTML page's
            // script#initData element via cheerio.)
            try {
                resObj = JSON.parse(body);
            } catch (parseErr) {
                console.error('response is not valid JSON for ' + urlLink + ':', parseErr.message);
                return;
            }
            if (!resObj || !Array.isArray(resObj.list)) {
                console.error('response has no "list" array for ' + urlLink);
                return;
            }
            /* resObj example
                {
                    total: 1500,
                    end: false,
                    sid: "6b57a007f19740b44d562f6e0ec6e050",
                    ran: 0,
                    ras: 0,
                    lastindex: 121,
                    ceg: 181011782,
                    list: [{
                        id: "7697671c2932936c55a39fd2e4d30ceb",
                        qqface_down_url: false,
                        downurl: false,
                        grpmd5: false,
                        type: 0,
                        src: "1",
                        index: 61,
                        title: "<em>美女</em>诱惑_peaceful",
                        litetitle: "",
                        width: "1000", (key name reconstructed; it was garbled in the original note)
                        height: "1504",
                        imgsize: "225KB",
                        imgtype: "JPEG",
                        key: "7913541bc5",
                        dspurl: "blog.sina.com.cn",
                        link: "http://blog.sina.com.cn/s/blog_a5bc8202010109ta.html",
                        source: 2,
                        img: "http://img165.poco.cn/mypoco/myphoto/20111030/05/54704062201110300502223689419360167_010.jpg",
                        thumb_bak: "http://p0.so.qhmsg.com/t01da6596eb67097425.jpg",
                        thumb: "http://p0.so.qhmsg.com/t01da6596eb67097425.jpg",
                        _thumb_bak: "http://p0.so.qhmsg.com/sdr/_240_/t01da6596eb67097425.jpg",
                        _thumb: "http://p0.so.qhmsg.com/sdr/_240_/t01da6596eb67097425.jpg",
                        thumbWidth: 160,
                        dsptime: "",
                        thumbHeight: 240,
                        grpcnt: "8",
                        fixedSize: false
                    }],
                    boxresult: null,
                    wordguess: null
                }
             */

            // Entries without an `img` URL are skipped so that no undefined
            // value reaches the downloader.
            resObj.list.forEach(function(item) {
                if (item && item.img) {
                    imgList.push(item.img);
                }
            });

            if (imgList.length >= targetAmount) {
                cb();
                return;
            }

            // An empty page is treated like `end`: requesting again could
            // otherwise repeat forever without collecting anything new.
            if (resObj.end || resObj.list.length === 0) {
                console.log('no more datas from source url');
                return;
            }

            // Continue from the entry after the last one seen in this response.
            urlObj.query.sn = resObj.lastindex + 1;
            urlObj.query.sid = resObj.sid;
            loop(cb);
        });
    }
    
    /*
     * Download every URL in `imgList`, one after another, into
     * `collect_pic_dir`, naming the files 0.jpg, 1.jpg, ... via the module-level
     * `start` counter. A 400 ms pause precedes each download.
     *
     * The next download begins only after the previous one has finished or
     * failed, so the closing "all done" line and its elapsed time (measured from
     * `t1`) are printed after the last file has been handled rather than when
     * the last request was merely started. A failed download is logged and does
     * not abort the remaining ones.
     */
    function asyncDownload() {
        console.log('图片总数:', imgList.length);
        async.mapSeries(imgList, function(item, callback) {
            setTimeout(function() {
                const target = collect_pic_dir + start + '.jpg';
                start++;
                downloadPic(item, target, function() {
                    // Always report success to mapSeries: passing an error
                    // would cancel all remaining downloads.
                    callback(null, item);
                });
            }, 400);
        }, function(err, results) {
            let t2 = new Date().getTime();
            console.log('全部完成,总耗时:', (t2 - t1) + 'ms');
        });
    }

    /*
     * Stream a single remote file to disk.
     *
     * @param {string} src - URL to download.
     * @param {string} dest - Path of the file to write.
     * @param {Function} [done] - Optional. Called exactly once with `null` when
     *     the file has been fully written, or with the error when the request or
     *     the write failed. Callers that pass only two arguments keep working.
     */
    function downloadPic(src, dest, done) {
        let settled = false;
        const settle = function(err) {
            if (settled) {
                return;
            }
            settled = true;
            if (err) {
                console.log(err);
            }
            if (typeof done === 'function') {
                done(err || null);
            }
        };

        const out = fs.createWriteStream(dest);
        out.on('finish', function() {
            settle(null);
        });
        out.on('error', settle);

        try {
            request
                .get(src)
                .on('response', function(response) {
                    // The body is still written to `dest`; the status is only
                    // reported so that a saved error page can be spotted.
                    if (response.statusCode !== 200) {
                        console.log('unexpected HTTP status ' + response.statusCode + ' for ' + src);
                    }
                })
                .on('error', function(err) {
                    // Nothing more will be piped after a request error, so the
                    // file is closed here.
                    out.end();
                    settle(err);
                })
                .pipe(out);
        } catch (err) {
            // request.get() can throw synchronously (e.g. for a missing URL).
            out.end();
            settle(err);
        }
    }
  • 相关阅读:
    Laravel 初始化
    ant design pro 左上角 logo 修改
    请求到服务端后是怎么处理的
    Websocket 知识点
    王道数据结构 (7) KMP 算法
    王道数据结构 (6) 简单的模式匹配算法
    王道数据结构 (4) 单链表 删除节点
    王道数据结构 (3) 单链表 插入节点
    王道数据结构 (2) 单链表 尾插法
    王道数据结构 (1) 单链表 头插法
  • 原文地址:https://www.cnblogs.com/xiaohaifengke/p/7698913.html
Copyright © 2020-2023  润新知