• Scraping Zhilian Zhaopin (智联招聘) job listings


    import urllib.request
    import urllib.parse
    from bs4 import BeautifulSoup
    import json
    
    
    class ZhiLianSpider(object):
        url = "https://sou.zhaopin.com/?"
    
        def __init__(self, jl, kw, start_page, end_page):
            self.jl = jl
            self.kw = kw
            self.start_page = start_page
            self.end_page = end_page
            self.items = []  # empty list that will hold every scraped job record
    
        # Parse the HTML and extract the fields we need
        def parse_content(self, content):
            soup = BeautifulSoup(content, 'html.parser')
            table_list = soup.select('#listContent > table')[1:]
            for table in table_list:
                zwmc = table.select('.zwmc > div > a')[0].text
                gsmc = table.select('.gsmc > a')[0].text
                zwyx = table.select('.zwyx')[0].text
                gzdd = table.select('.gzdd')[0].text
                gxsj = table.select('.gxsj > span')[0].text
                item = {
                    '职位名称': zwmc,  # job title
                    '公司名称': gsmc,  # company name
                    '职位月薪': zwyx,  # monthly salary
                    '工作地点': gzdd,  # work location
                    '更新时间': gxsj,  # last updated
                }
                self.items.append(item)
    
        # Start the crawl
        def run(self):
            for page in range(self.start_page, self.end_page+1):
                request = self.handler_request(page)  # build the Request object for this page
                content = urllib.request.urlopen(request).read().decode()  # send the GET request and read the HTML
                self.parse_content(content)
            string_items = json.dumps(self.items, ensure_ascii=False)  # serialize the list of dicts to a JSON string
            with open("zhilian.txt", "w", encoding="utf-8") as f:      # ensure_ascii=False keeps the Chinese text readable in the file
                f.write(string_items)
    
        def handler_request(self, page):  # build the percent-encoded query URL and the Request object
            data = {
                'jl': self.jl,
                'kw': self.kw,
                'p': page
            }
            get_url = ZhiLianSpider.url + urllib.parse.urlencode(data)  # values contain Chinese, so percent-encode them with urlencode
            # print(get_url)
            headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) Apple
                              WebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
            }
            request = urllib.request.Request(url=get_url, headers=headers)
            return request
    
    
    def main():
        jl = input("请输入工作地点:")
        kw = input("请输入工作关键词:")
        start_page = int(input("请输入查询起始页面:"))
        end_page = int(input("查询结束页面:"))
    
        # create the spider and start crawling
        spider = ZhiLianSpider(jl, kw, start_page, end_page)
        spider.run()
    
    
    if __name__ == '__main__':
        main()
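
  The query string in handler_request is built with urllib.parse.urlencode, which
  percent-encodes non-ASCII values such as a Chinese city name before they are
  appended to the base URL. Below is a minimal standalone sketch of just that step;
  the jl and kw values are illustrative, not taken from the article:

    from urllib.parse import urlencode

    # Hypothetical example parameters; urlencode percent-encodes their UTF-8 bytes.
    params = {'jl': '北京', 'kw': 'python', 'p': 1}
    print("https://sou.zhaopin.com/?" + urlencode(params))
    # -> https://sou.zhaopin.com/?jl=%E5%8C%97%E4%BA%AC&kw=python&p=1

  Since run() writes the results as a single JSON string, zhilian.txt can be read
  back into a Python list with json.loads.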
  • Original article: https://www.cnblogs.com/nxrs/p/11360056.html