• Crawling the info of all items in the second-hand section of Ganji.com


    Crawl the information of all items under the second-hand goods section of Ganji.com.
    The general approach:
    1. Crawl the channel page URLs;
    2. Crawl the product detail page URLs and write them to MongoDB (the url_list collection);
    3. Read the URLs back from url_list, crawl each product's info, and write it to MongoDB (the p_info collection).

    The code is split into 3 .py files:
    1. channel_url.py: gets the channel page URLs;
    2. page_parse.py: mainly the 2 crawler functions, each of which writes to one of the 2 collections;
    3. main.py: the main program (this file), which starts multiple processes and runs the whole job.

    The job finished smoothly in the end. It feels like Ganji has really declined these days; there is not much data left.
    channel_url.py:
    import requests
    from bs4 import BeautifulSoup
    
    start_url = 'http://bj.ganji.com/wu/'
    url_host = 'http://bj.ganji.com'
    
    
    def get_channel_url(url):
        channel_urls = []
        raw_data = requests.get(url).text
        soup = BeautifulSoup(raw_data,'lxml')
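        # the <dt><a> elements under div.content are the category (channel) links on the second-hand index page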
        eles = soup.select('div.content dt>a')
        for e in eles:
            channel_url = url_host + e.get('href')
            print(channel_url)
            channel_urls.append(channel_url)
        return channel_urls
    
    # channel_urls = get_channel_url(start_url)
    # print('len(channel_urls):',len(channel_urls))
    
    # This is the output of get_channel_url(start_url), saved directly so the function doesn't have to be run again
    channel_urls = '''
        http://bj.ganji.com/jiaju/
        http://bj.ganji.com/rirongbaihuo/
        http://bj.ganji.com/shouji/
        http://bj.ganji.com/bangong/
        http://bj.ganji.com/nongyongpin/
        http://bj.ganji.com/jiadian/
        http://bj.ganji.com/ershoubijibendiannao/
        http://bj.ganji.com/ruanjiantushu/
        http://bj.ganji.com/yingyouyunfu/
        http://bj.ganji.com/diannao/
        http://bj.ganji.com/xianzhilipin/
        http://bj.ganji.com/fushixiaobaxuemao/
        http://bj.ganji.com/meironghuazhuang/
        http://bj.ganji.com/shuma/
        http://bj.ganji.com/laonianyongpin/
        http://bj.ganji.com/xuniwupin/
        http://bj.ganji.com/qitawupin/
        http://bj.ganji.com/ershoufree/
        http://bj.ganji.com/wupinjiaohuan/
    '''
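
    # A quick check of how this string is consumed later (main.py calls channel_urls.split()):
    # urls = channel_urls.split()
    # print(len(urls))    # 19 channels
    # print(urls[0])      # http://bj.ganji.com/jiaju/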
    

      

    page_parse.py:
    import requests
    from bs4 import BeautifulSoup
    from time import sleep
    from pymongo import MongoClient
    
    client = MongoClient('localhost',27017)
    ganji = client['ganji']
    url_list = ganji['url_list']
    p_info = ganji['p_info']
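    # Optional hardening, not in the original script: a unique index on p_url would block
    # duplicate documents if get_product_url() is re-run (insert_one would then raise
    # DuplicateKeyError for repeats, which would need handling).
    # url_list.create_index('p_url', unique=True)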
    
    # Given a channel URL, crawl the URLs of all products under that channel, print them and write them to MongoDB
    def get_product_url(url):
        channel_url = url
        page_num = 1
        while True:
            raw_page = requests.get(url).text
            print('Fetching page:', url)
            sleep(2)
            soup = BeautifulSoup(raw_page,'lxml')
            eles = soup.select('a.ft-tit')
            print('len(eles):',len(eles))
            for e in eles:
                p_url = e.get('href')
                url_list.insert_one({'p_url':p_url})
                print(p_url)
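            # Ganji paginates a channel as <channel_url>o<page_num>/, so keep going while a "next page" link exists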
            if soup.select('a.next'):
                page_num += 1
                url = channel_url + 'o' + str(page_num) + '/'
            else:
                break
    
    # Given a product detail page URL, crawl the product details, print them and write them to MongoDB
    def get_product_info(url):
        raw_page = requests.get(url).text
        sleep(2)
        soup = BeautifulSoup(raw_page,'lxml')
    
        if soup.select("p:contains('信息刚被删除~')"):   # 判断商品信息是否已经删除
            print('信息刚被删除~')
            pass
        else:
            title = soup.select('h1.title-name')[0].get_text() if soup.select('h1.title-name') else None
            category = list(soup.select('div.crumbs.routes.clearfix')[0].stripped_strings) if soup.select('div.crumbs.routes.clearfix') else None
            date = soup.select('i.pr-5')[0].get_text().split()[0] if soup.select('i.pr-5') else None   # keep only the first whitespace-separated token (the publish date)
            price = soup.select('i.f22.fc-orange.f-type')[0].get_text() if soup.select('i.f22.fc-orange.f-type') else None
            address = soup.select('ul.det-infor>li:nth-child(2)>a')[0].get_text() if soup.select('ul.det-infor>li:nth-child(2)>a') else None
            p_dict = {'title':title,'category':category,'date':date,'price':price,'address':address,'url':url}
            p_info.insert_one(p_dict)
            print(p_dict)
    

      

    main.py:
    from channel_url import channel_urls     # importing one name from channel_url.py runs the whole module once, but its other names stay inside that module
    from page_parse import get_product_url, get_product_info, url_list   # url_list needs to be imported here as well
    from multiprocessing import Pool
    from datetime import datetime
    
    # Read the product URLs from MongoDB and return all of them as a list
    def read_all_p_urls():
        all_p_urls = []
        for item in url_list.find():
            all_p_urls.append(item['p_url'])
        return all_p_urls
    
    
    if __name__ == '__main__':
        start_time = datetime.now()
        
        # Without multiprocessing it takes several times longer:
        # for channel in channel_urls.split():
        #     get_product_url(channel)
    
        pool = Pool()
        # With multiprocessing; 4 processes and the automatically chosen number take about the same time
        # pool = Pool(processes=4)
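        # Pool() with no argument defaults to os.cpu_count() worker processes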
    
        # From the channel URLs, get the product URLs and write them to MongoDB
        pool.map(get_product_url, channel_urls.split())

        # From the product URLs, get the product info and write it to MongoDB; this call can be run separately from the one above
        pool.map(get_product_info, read_all_p_urls())
    
        end_time = datetime.now()
        during = end_time - start_time
        print('Total time:', during)
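
    Since the two pool.map() calls can be run separately, it is handy to watch the two collections grow while the crawlers work. Below is a small monitor sketch; it is an extra file, not one of the three above, and it only assumes the same local MongoDB and the same ganji database:

    # monitor.py - print the collection sizes every few seconds
    from time import sleep
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    ganji = client['ganji']

    while True:
        print('url_list:', ganji['url_list'].count_documents({}),
              '| p_info:', ganji['p_info'].count_documents({}))
        sleep(5)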
    

      

  • Original post: https://www.cnblogs.com/djlbolgs/p/12539821.html