• Golang框架beego电影网爬虫小试牛刀


    学习了一段时间golang,又参考课程学习了beego开发网站爬虫,项目的目录结构是:

    采集的目标是豆瓣网电影,入口地址是:https://movie.douban.com/subject/1900841/?from=subject-page

    数据结果

    数据表结构

    -- movie_info: one row per crawled movie page.
    -- Secondary indexes exist on movie_id, create_time and modify_time.
    -- NOTE(review): movie_on_time and create_time default to the zero date
    -- '0000-00-00 00:00:00'; MySQL servers running with NO_ZERO_DATE / strict
    -- sql_mode are expected to reject such defaults -- confirm against the
    -- target server before running this statement.
    CREATE TABLE `movie_info` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `movie_id` int(11) unsigned NOT NULL COMMENT '电影id',
      `movie_name` varchar(100) DEFAULT NULL COMMENT '电影名称',
      `movie_pic` varchar(200) DEFAULT NULL COMMENT '电影图片',
      `movie_director` varchar(50) DEFAULT NULL COMMENT '电影导演',
      `movie_writer` varchar(50) DEFAULT NULL COMMENT '电影编剧',
      `movie_country` varchar(50) DEFAULT NULL COMMENT '电影产地',
      `movie_language` varchar(50) DEFAULT NULL COMMENT '电影语言',
      `movie_main_character` varchar(50) DEFAULT NULL COMMENT '电影主演',
      `movie_type` varchar(50) DEFAULT NULL COMMENT '电影类型',
      `movie_on_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '电影上映时间',
      `movie_span` varchar(20) DEFAULT NULL COMMENT '电影时长',
      `movie_grade` varchar(5) DEFAULT NULL COMMENT '电影评分',
      `remark` varchar(500) DEFAULT '' COMMENT '备注',
      `create_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '创建时间',
      `modify_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
      `status` tinyint(1) DEFAULT '1',
      PRIMARY KEY (`id`),
      KEY `idx_movie_id` (`movie_id`),
      KEY `idx_create_time` (`create_time`),
      KEY `idx_modify_time` (`modify_time`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='电影信息表';

    文件

    app.conf文件,用来配置数据库

    appname = mypro
    httpport = 8080
    runmode = dev
    dbhost = 127.0.0.1
    dbport = 3306
    dbname = myblog
    dbuser = root
    dbpwd = root

     

    路由文件router.go

    //router.go文件
    package routers
    
    import (
        "mypro/controllers"
        "github.com/astaxie/beego"
    )
    
    // init registers the application's URL paths with beego when the routers
    // package is loaded. Routes are registered in the order listed.
    func init() {
        routes := []struct {
            path    string
            handler beego.ControllerInterface
        }{
            {"/", &controllers.MainController{}},
            {"/collection", &controllers.CollectionController{}},
        }
        for _, r := range routes {
            beego.Router(r.path, r.handler)
        }
    }

    控制器下文件

    package controllers
    
    import (
        "fmt"
        "github.com/astaxie/beego"
        "github.com/astaxie/beego/httplib"
        "mypro/models"
        "time"
    )
    
    // CollectionController is the controller whose Get method runs the movie
    // crawler. It embeds beego.Controller and declares no fields of its own.
    type CollectionController struct {
        beego.Controller
    }
    
    // Get runs the crawl loop synchronously inside the HTTP request.
    //
    // It seeds the Redis-backed URL queue with one Douban movie page and then,
    // until the queue is empty, pops a URL, skips it if it is already in the
    // visited set, downloads the page, stores the movie details when the page
    // yields a movie name, enqueues every link returned by models.GetMovieUrls
    // and finally records the URL as visited. The queue length seen on each
    // iteration and the id of every inserted row are written to the response.
    //
    // A page that cannot be downloaded is reported on stdout and skipped; it is
    // not added to the visited set, so it can be retried if another page links
    // to it again.
    func (c *CollectionController) Get() {
        // Configure the Redis address before the first queue operation. The seed
        // URL used to be enqueued before the client address had been set.
        models.ConnectRedis("127.0.0.1:6379")

        sUrl := "https://movie.douban.com/subject/1900841/?from=subject-page" // crawl entry point
        models.PutinUrlQueue(sUrl)

        for {
            queueLen := models.GetQueueLength()
            c.Ctx.WriteString(fmt.Sprintf("---%v---", queueLen))
            if queueLen == 0 {
                break
            }

            // Take the next URL off the queue.
            sUrl = models.PopfromQueue()

            // Skip URLs that are already in the visited set.
            if models.IsVisit(sUrl) {
                continue
            }

            req := httplib.Get(sUrl)
            // User-Agent and Cookie are set so that Douban does not answer with 403.
            req.Header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            req.Header("Cookie", `bid=gFP9qSgGTfA; __utma=30149280.1124851270.1482153600.1483055851.1483064193.8; __utmz=30149280.1482971588.4.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118221"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1483064193%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=5afcf5e5496eab22.1482413017.7.1483066280.1483057909.; __utma=223695111.1636117731.1482413017.1483055857.1483064193.7; __utmz=223695111.1483055857.6.5.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=BDC2DBEDF8958EC838F9D9394CC5D9A0|2cc6ef7952be8c2d5408cb7c8cce2684; ap=1; viewed="1006073"; gr_user_id=e5c932fc-2af6-4861-8a4f-5d696f34570b; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1483064193; __utmb=223695111.0.10.1483064193`)
            sMovieHtml, err := req.String()
            if err != nil {
                // One unreachable page must not abort the whole crawl.
                fmt.Println("fetch", sUrl, "failed:", err)
                time.Sleep(time.Second) // stay polite on the failure path too
                continue
            }

            var movieInfo models.MovieInfo
            movieInfo.Movie_name = models.GetMovieName(sMovieHtml)

            // An empty name means the page is not a movie page.
            if movieInfo.Movie_name != "" {
                movieInfo.Movie_director = models.GetMovieDirector(sMovieHtml)
                movieInfo.Movie_main_character = models.GetMovieMainCharacters(sMovieHtml)
                movieInfo.Movie_type = models.GetMovieGenre(sMovieHtml)
                movieInfo.Movie_on_time = models.GetMovieOnTime(sMovieHtml)
                movieInfo.Movie_grade = models.GetMovieGrade(sMovieHtml)
                movieInfo.Movie_span = models.GetMovieRunningTime(sMovieHtml)

                // Report the new row id only when the insert actually succeeded.
                id, err := models.AddMovieToDb(&movieInfo)
                if err != nil {
                    fmt.Println("insert movie from", sUrl, "failed:", err)
                } else {
                    c.Ctx.WriteString(fmt.Sprintf("%v", id))
                }
            }

            // Enqueue every link found on this page; already-visited ones are
            // filtered out when they are popped.
            for _, url := range models.GetMovieUrls(sMovieHtml) {
                models.PutinUrlQueue(url)
            }

            // Record the page as visited so it is not processed again.
            models.AddToSet(sUrl)
            time.Sleep(time.Second) // throttle requests
        }
        c.Ctx.WriteString("爬虫执行结束")
    }

    models目录下文件

    package models
    
    import (
        "github.com/astaxie/beego"
        "regexp" //正则包
        "strings"
    
        //"strings"
        "github.com/astaxie/beego/orm"
        _ "github.com/go-sql-driver/mysql"
    )
    
    // db is the package-level ORM handle. It is assigned in init and used by
    // AddMovieToDb.
    var (
        db orm.Ormer
    )
    
    // MovieInfo is the model registered with the ORM in init and inserted by
    // AddMovieToDb.
    // NOTE(review): presumably mapped onto the movie_info table by the ORM's
    // naming convention - confirm. The field names are part of that mapping and
    // of the package's public interface, so they keep their underscores.
    type MovieInfo struct {
        Id                   int64  // record id
        Movie_id             int64  // movie id
        Movie_name           string // movie title
        Movie_pic            string // movie picture
        Movie_director       string // director
        Movie_writer         string // screenwriter
        Movie_country        string // country of production
        Movie_language       string // language
        Movie_main_character string // leading actors
        Movie_type           string // genre
        Movie_on_time        string // release date
        Movie_span           string // running time
        Movie_grade          string // rating
        Create_time          string // creation time
    }
    
    // init reads the database settings (dbhost, dbport, dbname, dbuser, dbpwd)
    // from the beego application config, registers the "default" MySQL
    // connection and the MovieInfo model with the ORM, and stores a shared
    // Ormer in db.
    //
    // It panics with the underlying error when the database cannot be
    // registered, instead of discarding that error.
    func init() {
        orm.Debug = true // debug mode prints the generated SQL statements

        dbhost := beego.AppConfig.String("dbhost")
        dbport := beego.AppConfig.String("dbport")
        dbname := beego.AppConfig.String("dbname")
        dbuser := beego.AppConfig.String("dbuser")
        dbpwd := beego.AppConfig.String("dbpwd")

        dsn := dbuser + ":" + dbpwd + "@tcp(" + dbhost + ":" + dbport + ")/" + dbname + "?charset=utf8"
        if err := orm.RegisterDataBase("default", "mysql", dsn); err != nil {
            // Surface the real cause (bad credentials, unreachable host, ...).
            panic("models: register database: " + err.Error())
        }
        orm.RegisterModel(new(MovieInfo))
        db = orm.NewOrm()
    }
    
    
    // movieNameRe matches the <span property="v:itemreviewed"> element and
    // captures its text. The whitespace class needs its backslash (\s); without
    // it the pattern reads "spans*" and can never match "<span property=...>".
    var movieNameRe = regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)

    // GetMovieName returns the text of the first v:itemreviewed span found in
    // html, or "" when html is empty or contains no such element.
    func GetMovieName(html string) string {
        m := movieNameRe.FindStringSubmatch(html)
        if m == nil {
            return ""
        }
        return m[1]
    }
    
    // movieDirectorRe captures the text of each anchor tagged
    // rel="v:directedBy". The capture is non-greedy so that several directors on
    // one line are captured individually instead of as one span of markup.
    var movieDirectorRe = regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)

    // GetMovieDirector returns the directors found in html, joined with "/" in
    // document order. It returns "" when html is empty or contains no
    // v:directedBy anchor; a page without a director no longer panics.
    func GetMovieDirector(html string) string {
        matches := movieDirectorRe.FindAllStringSubmatch(html, -1)
        names := make([]string, 0, len(matches))
        for _, m := range matches {
            // Skip anchors with empty text so they leave no stray separators.
            if m[1] != "" {
                names = append(names, m[1])
            }
        }
        return strings.Join(names, "/")
    }
    
    // GetMovieMainCharacters collects the text of every anchor tagged
    // rel="v:starring" in html and returns the names joined with "/", with any
    // leading or trailing "/" trimmed. It returns "" for empty input or when no
    // such anchor is present.
    func GetMovieMainCharacters(html string) string {
        if html == "" {
            return ""
        }
        starring := regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)
        var names []string
        for _, m := range starring.FindAllStringSubmatch(html, -1) {
            names = append(names, m[1])
        }
        return strings.Trim(strings.Join(names, "/"), "/")
    }
    
    
    // GetMovieGenre gathers the text of every <span property="v:genre"> element
    // in html and returns the genres separated by "/", with leading and trailing
    // "/" trimmed. It returns "" for empty input or when no genre is present.
    func GetMovieGenre(html string) string {
        if html == "" {
            return ""
        }
        genreRe := regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)
        var joined strings.Builder
        for _, m := range genreRe.FindAllStringSubmatch(html, -1) {
            joined.WriteString(m[1])
            joined.WriteString("/")
        }
        return strings.Trim(joined.String(), "/")
    }
    
    // GetMovieOnTime returns the text of the first
    // <span property="v:initialReleaseDate"> element in html, or "" when html is
    // empty or has no such element.
    func GetMovieOnTime(html string) string {
        if html == "" {
            return ""
        }
        releaseRe := regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)
        first := releaseRe.FindStringSubmatch(html)
        if first == nil {
            return ""
        }
        return first[1]
    }
    
    // GetMovieGrade returns the text of the first
    // <strong property="v:average"> element in html, or "" when html is empty or
    // has no such element.
    func GetMovieGrade(html string) string {
        if html == "" {
            return ""
        }
        gradeRe := regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)
        if hit := gradeRe.FindStringSubmatch(html); len(hit) > 1 {
            return hit[1]
        }
        return ""
    }
    
    // GetMovieRunningTime returns the text of the first
    // <span property="v:runtime"> element in html, or "" when html is empty or
    // has no such element.
    func GetMovieRunningTime(html string) string {
        runtime := ""
        if html == "" {
            return runtime
        }
        runtimeRe := regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)
        if groups := runtimeRe.FindStringSubmatch(html); groups != nil {
            runtime = groups[1]
        }
        return runtime
    }
    
    
    
    // AddMovieToDb inserts movie_info through the shared ORM handle and returns
    // whatever db.Insert reports: the id value and the error.
    func AddMovieToDb(movie_info *MovieInfo) (int64, error) {
        return db.Insert(movie_info)
    }
    
    
    // movieURLRe captures the href of anchors pointing at movie.douban.com. The
    // dots in the host name are escaped so that look-alike hosts such as
    // "movie-douban.com" are not matched.
    var movieURLRe = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

    // GetMovieUrls returns the distinct https://movie.douban.com/ links found in
    // anchor tags of html, in order of first appearance. It returns nil when
    // there are none.
    func GetMovieUrls(html string) []string {
        matches := movieURLRe.FindAllStringSubmatch(html, -1)
        seen := make(map[string]struct{}, len(matches))
        var urls []string
        for _, m := range matches {
            link := m[1]
            // A page usually repeats the same link several times; keep one copy.
            if _, dup := seen[link]; dup {
                continue
            }
            seen[link] = struct{}{}
            urls = append(urls, link)
        }
        return urls
    }

    redis.go文件

    package models
    
    import (
        "github.com/astaxie/goredis"
    )
    // RediCclient is the package-level goredis client used by the queue and
    // visited-set helpers in this file.
    // NOTE(review): the name looks like a typo of "RedisClient"; it is exported,
    // so check for callers outside this package before renaming.
    var (
        RediCclient goredis.Client
    )
    
    // Redis keys used by the crawler.
    const (
        URL_QUEUE     = "url_queue"     // key of the list used as the URL queue
        URL_VISIT_SET = "url_visit_set" // key of the set recording URLs already visited
    )
    // ConnectRedis stores addr as the address of the shared Redis client. It
    // only assigns the client's Addr field; this function itself performs no
    // network call.
    func ConnectRedis(addr string) {
        RediCclient.Addr = addr
    }
    
    // PutinUrlQueue left-pushes url onto the URL_QUEUE list through the shared
    // Redis client. Whatever Lpush returns is discarded.
    func PutinUrlQueue(url string) {
        entry := []byte(url)
        RediCclient.Lpush(URL_QUEUE, entry)
    }
    
    // GetQueueLength reports the length of the URL_QUEUE list. When the Llen
    // call returns an error the queue is reported as empty (0).
    func GetQueueLength() int {
        if n, err := RediCclient.Llen(URL_QUEUE); err == nil {
            return n
        }
        return 0
    }
    
    // PopfromQueue right-pops one entry from the URL_QUEUE list and returns it
    // as a string. It panics with the error if the Rpop call fails.
    func PopfromQueue() string {
        raw, popErr := RediCclient.Rpop(URL_QUEUE)
        if popErr == nil {
            return string(raw)
        }
        panic(popErr)
    }
    
    // AddToSet adds url to the URL_VISIT_SET set through the shared Redis
    // client. Whatever Sadd returns is discarded.
    func AddToSet(url string) {
        member := []byte(url)
        RediCclient.Sadd(URL_VISIT_SET, member)
    }
    
    
    
    // IsVisit reports whether url is a member of the URL_VISIT_SET set. A failed
    // Sismember call is treated as "not visited" and yields false.
    func IsVisit(url string) bool {
        found, err := RediCclient.Sismember(URL_VISIT_SET, []byte(url))
        return err == nil && found
    }
  • 相关阅读:
    Springboot 拦截器配置(登录拦截)
    SVN server 服务端修改端口号
    Latex 添加新的宏包
    鼠标右键快捷键修改的所对应注册表位置
    软件和电脑分辨率不一致解决办法——更改高DPI设置
    Latex中使用pdflatex编译图片出错:Unknown graphics extension: .eps. ...raphics[height=3.3cm]{figures/Var.eps}
    latex-TexStudio-tex源文件与pdf正反搜索(正反定位)设置
    假设检验
    大数据预处理技术
    chrome无法从该网站添加应用、扩展程序和用户脚本的有效解决方法!
  • 原文地址:https://www.cnblogs.com/wt645631686/p/9702572.html
Copyright © 2020-2023  润新知