一、爬虫项目
1、爬虫基础
a、网页上面会有相同的数据
b、去重处理
布隆过滤器
哈希存储
c、标签匹配:
正则表达式
Beautiful Soup 或 lxml 这类标签提取库
d、动态内容
PhantomJS
Selenium
二、爬豆瓣网电影
网站地址:https://www.douban.com/
准备工作:
1、在数据库中创建表
movie.sql
-- movie_info: one row per movie collected by the Douban crawler.
--
-- The original zero-date defaults ('0000-00-00 00:00:00') are replaced here:
-- MySQL 5.7+ rejects them under its default sql_mode (strict mode together
-- with NO_ZERO_DATE) and the CREATE TABLE fails with "Invalid default value".
CREATE TABLE `movie_info` (
    `id`                   int(10) unsigned NOT NULL AUTO_INCREMENT,
    `movie_id`             int(11) unsigned NOT NULL COMMENT '电影id',
    `movie_name`           varchar(100) COMMENT '电影名称',
    `movie_pic`            varchar(200) COMMENT '电影图片',
    `movie_director`       varchar(50) COMMENT '电影导演',
    `movie_writer`         varchar(50) COMMENT '电影编剧',
    `movie_country`        varchar(50) COMMENT '电影产地',
    `movie_language`       varchar(50) COMMENT '电影语言',
    `movie_main_character` varchar(50) COMMENT '电影主演',
    `movie_type`           varchar(50) COMMENT '电影类型',
    -- NULL (instead of a zero date) marks a release time that was not supplied.
    -- NOTE(review): TIMESTAMP cannot hold values before 1970-01-01; consider
    -- DATE/DATETIME if older films must be stored -- confirm against the crawler.
    `movie_on_time`        timestamp NULL DEFAULT NULL COMMENT '电影上映时间',
    `movie_span`           varchar(20) COMMENT '电影时长',
    `movie_grade`          varchar(5) COMMENT '电影评分',
    `remark`               varchar(500) DEFAULT '' COMMENT '备注',
    -- Filled in automatically on insert; never touched by later updates.
    `_create_time`         timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    -- Refreshed automatically whenever the row is updated.
    `_modify_time`         timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
    `_status`              tinyint(1) DEFAULT '1',
    PRIMARY KEY (`id`),
    KEY `idx_movie_id` (`movie_id`),
    KEY `idx_create_time` (`_create_time`),
    KEY `idx_modify_time` (`_modify_time`)
) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8 COMMENT='电影信息表';
2、创建一个新项目
D:\GoFiles\src\web>bee new crawl_movice
______
| ___ \
| |_/ /  ___   ___
| ___ \ / _ \ / _ \
| |_/ /|  __/|  __/
\____/  \___| \___| v1.10.0
2019/02/16 10:49:19 INFO     ▶ 0001 Creating application...
        create   D:\GoFiles\src\web\crawl_movice\
        create   D:\GoFiles\src\web\crawl_movice\conf\
        create   D:\GoFiles\src\web\crawl_movice\controllers\
        create   D:\GoFiles\src\web\crawl_movice\models\
        create   D:\GoFiles\src\web\crawl_movice\routers\
        create   D:\GoFiles\src\web\crawl_movice\tests\
        create   D:\GoFiles\src\web\crawl_movice\static\
        create   D:\GoFiles\src\web\crawl_movice\static\js\
        create   D:\GoFiles\src\web\crawl_movice\static\css\
        create   D:\GoFiles\src\web\crawl_movice\static\img\
        create   D:\GoFiles\src\web\crawl_movice\views\
        create   D:\GoFiles\src\web\crawl_movice\conf\app.conf
        create   D:\GoFiles\src\web\crawl_movice\controllers\default.go
        create   D:\GoFiles\src\web\crawl_movice\views\index.tpl
        create   D:\GoFiles\src\web\crawl_movice\routers\router.go
        create   D:\GoFiles\src\web\crawl_movice\tests\default_test.go
        create   D:\GoFiles\src\web\crawl_movice\main.go
2019/02/16 10:49:20 SUCCESS  ▶ 0002 New application successfully created!