• Python Crawler (8)


    Source code:

import requests
import re
from my_mysql import MysqlConnect
import time, random


# Get the detail-page links of the job postings on one result page
def get_urls(page, headers):
    url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start={}'.format(page)
    response = requests.get(url, headers=headers)
    pat = r'href="(position_detail.*?)">'
    url_list_bytes = re.findall(pat.encode('utf-8'), response.content)
    return url_list_bytes

# Get the details of one job posting
def get_info(url, headers):
    response = requests.get(url, headers=headers)
    html_bytes = response.content
    # print(html_bytes)

    # title
    pat = r'id="sharetitle">(.*?)</td>'
    res = re.search(pat.encode('utf-8'), html_bytes)
    title = res.group(1).decode('utf-8')
    # address (work location)
    pat = r'工作地点:</span>(.*?)</td>'
    res = re.search(pat.encode('utf-8'), html_bytes)
    address = res.group(1).decode('utf-8')
    # types (job category)
    pat = r'职位类别:</span>(.*?)</td>'
    res = re.search(pat.encode('utf-8'), html_bytes)
    types = res.group(1).decode('utf-8')
    # counts (number of openings)
    pat = r'招聘人数:</span>(.*?)</td>'
    res = re.search(pat.encode('utf-8'), html_bytes)
    counts = res.group(1).decode('utf-8')
    # duty (job responsibilities)
    pat = r'工作职责.*?<ul class="squareli">(.*?)</ul>'
    res = re.search(pat.encode('utf-8'), html_bytes)
    duty_str = res.group(1).decode('utf-8')
    pat = r'<li>(.*?)</li>'
    duty = re.findall(pat, duty_str)
    duty = '\n'.join(duty)
    # requires (job requirements)
    pat = r'工作要求.*?<ul class="squareli">(.*?)</ul>'
    res = re.search(pat.encode('utf-8'), html_bytes)
    requires_str = res.group(1).decode('utf-8')
    pat = r'<li>(.*?)</li>'
    requires = re.findall(pat, requires_str)
    requires = '\n'.join(requires)
    return title, address, types, counts, duty, requires


if __name__ == '__main__':
    mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
    sql = "insert into tencentzp(title,address,types,counts,duty,requires) values(%s,%s,%s,%s,%s,%s)"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    for page in range(0, 200, 10):
        url_list_bytes = get_urls(page, headers)
        # print(url_list_bytes)
        for url in url_list_bytes:
            # print(url.decode('utf-8'))
            url = 'https://hr.tencent.com/' + url.decode('utf-8')
            info = get_info(url, headers)
            print(info)
            mc.exec_data(sql, info)
            # random delay between requests to avoid hammering the server
            time.sleep(random.random() * 5)
• Original article: https://www.cnblogs.com/zhxd-python/p/9501321.html