• 并发爬取直聘网招聘信息


    #并发爬取直聘网招聘信息
    # Concurrently scrape job postings from zhipin.com search result pages.
    #
    # Worker processes download the pages (get_content); each downloaded page is
    # handed to get_msg, which extracts the postings and appends them to java.txt.
    import json
    import re
    import sys
    import urllib
    import urllib.parse
    import urllib.request
    from multiprocessing import Pool
    from urllib.request import urlopen

    # Compiled once at import time instead of on every callback invocation.
    # Captures job title, salary, address, experience ("jingyan"), education
    # ("xueli") and company name for each posting in a result page.
    _JOB_PATTERN = re.compile(
        r'ka="search_list_.*?<div class="job-title">(?P<job>.*?)</div>'
        r'.*?<span class="red">(?P<salary>.*?)</span>'
        r'.*?<p>(?P<adress>.*?)<em class="vline"></em>(?P<jingyan>.*?)<em class="vline"></em>(?P<xueli>.*?)</p>'
        r'.*?target="_blank">(?P<company>.*?)</a></h3>',
        re.S,
    )


    def get_content(target_url):
        """Download *target_url* and return the response body decoded as UTF-8.

        The returned text is what the pool passes to the ``get_msg`` callback.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        headers = {'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
        req = urllib.request.Request(url=target_url, headers=headers)
        # Context manager guarantees the HTTP response is closed after reading.
        with urlopen(req) as resp:
            return resp.read().decode("utf-8")


    def _parse_jobs(content):
        """Return a list of {job, salary, company} dicts found in *content*."""
        return [
            {
                'job': m.group("job"),
                "salary": m.group("salary"),
                'company': m.group("company"),
            }
            for m in _JOB_PATTERN.finditer(content)
        ]


    def get_msg(content):
        """Extract postings from page text and append them to ``java.txt``.

        Each posting is printed and written as one JSON object (non-ASCII
        characters kept as-is). Used as the ``apply_async`` callback below.
        """
        # `with` closes the file even if printing or serialization raises.
        with open("java.txt", "a", encoding="utf-8") as f:
            for dic in _parse_jobs(content):
                print(dic)
                s = json.dumps(dic, ensure_ascii=False)
                # NOTE(review): records are separated by a single space rather
                # than a newline — confirm this is the intended file format.
                f.write(s + " ")


    def _report_error(exc):
        """Report a failed download instead of letting the pool drop it silently."""
        print("page download failed: %r" % (exc,), file=sys.stderr)


    if __name__ == '__main__':
        # Percent-encode the Chinese keyword so it can be embedded in the URL.
        word = urllib.parse.quote("开发")

        # Result pages 1..10 of the search.
        url_lst = [
            "https://www.zhipin.com/c101280600/?query=Java%s&page=%s&ka=page-%s" % (word, i, i)
            for i in range(1, 11)
        ]

        p = Pool(4)  # up to 4 downloads run at the same time
        for url in url_lst:
            p.apply_async(
                get_content,
                args=(url,),
                callback=get_msg,
                error_callback=_report_error,
            )
        p.close()  # no more tasks will be submitted
        p.join()   # wait for all worker processes to finish
  • 相关阅读:
    Linux:文件解压与压缩
    Linux:环境变量
    Linux:Vim
    Linux:目录&文件基本操作
    Linux:文件权限
    Linux:用户&用户组操作
    Linux:Shell 常用通配符
    Linux:常用shell快捷键
    Linux:Linux 重要人物
    架构:层次化
  • 原文地址:https://www.cnblogs.com/knighterrant/p/10040604.html
Copyright © 2020-2023  润新知