# -*- coding: utf-8 -*-
"""
@author: Dell Created on Sun Dec 29 17:26:43 2019
"""
import re
import time
import queue
import threading
import requests
def getpagesource(url):
    """Fetch *url* and return the page source as text.

    Returns "" on any network failure, a non-200 status, or an
    undecodable body, so callers can treat all failures uniformly
    (the original implicitly returned None on non-200 responses).
    """
    try:
        # Bounded timeout so one dead host cannot hang a crawler thread forever.
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.content.decode("utf-8", "ignore")
        return ""
    except (requests.RequestException, UnicodeDecodeError):
        # Narrowed from a bare `except:` so real bugs still surface.
        return ""
def getemaillist(page_source):
    """Extract every email address found in *page_source*.

    Returns a list of matched addresses; an empty list when the input is
    empty/None or contains no addresses (the original returned "" on
    error, which breaks callers that expect a list).
    """
    if not page_source:
        return []
    # The dot before the TLD must be escaped: a bare `.` would let any
    # character separate host and TLD (e.g. "user@host?tld" would match).
    pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
    return re.findall(pattern, page_source, re.IGNORECASE)
def gethostname(url):
    """Return the scheme+host prefix of *url*, e.g. "http://example.com".

    Returns "" when *url* does not start with an http(s) URL.
    The original pattern `(http://S*?)/` was broken: it lacked the
    backslash on `\\S`, required a trailing path slash, and ignored https.
    """
    if not url:
        return ""
    match = re.match(r"(https?://[^/\s]+)", url, re.IGNORECASE)
    return match.group(1) if match else ""
def getabsurl(page_source):
    """Collect all absolute http(s) hyperlinks from *page_source*.

    Returns a list of URLs; an empty list on empty input or no matches
    (the original returned "" on error, which breaks list callers).
    The original pattern `(http://s*S*?)["|>|)]` was missing the
    backslashes on `\\s`/`\\S` and so never matched a real URL.
    """
    if not page_source:
        return []
    # Match a URL up to a natural terminator: whitespace, quote,
    # angle bracket, or parenthesis.
    return re.findall(r'(https?://[^\s"\'<>()]+)', page_source, re.IGNORECASE)
def getrelurl(url, page_source):
    """Collect relative hrefs from *page_source*, resolved against *url*'s host.

    Absolute links, javascript: pseudo-links and fragment links are
    dropped; the remaining relative paths are joined onto the host of
    *url*. If no host can be extracted the relative paths are returned
    unresolved (the original's `hostname != None` guard was dead code —
    gethostname returns "" on failure, never None — so it silently
    produced broken "/foo" links).
    """
    hrefs = re.findall(r'href="(.*?)"', page_source, re.IGNORECASE)
    # Keep only relative links: drop absolute URLs, javascript: links and
    # fragment anchors. Building a new list avoids the original's
    # remove-while-iterating-a-copy dance.
    rel = [
        h for h in hrefs
        if "http://" not in h and "https://" not in h
        and "javascript" not in h and "#" not in h
    ]
    hostname = gethostname(url)  # "" when no host can be extracted
    if not hostname:
        return rel
    return [hostname + h if h.startswith("/") else hostname + "/" + h
            for h in rel]
def getallurl(page_source):
    """Gather every hyperlink on the page.

    Combines the absolute links with the relative links resolved against
    the host of the first absolute link found. Returns [] when the page
    has no absolute links (there is then no base to resolve against).
    The original's up-front dummy assignments were dead code and its
    `extend` could receive a non-list error sentinel.
    """
    absurllist = getabsurl(page_source)
    if not absurllist:
        return []
    # Use the first absolute link as the base host for relative paths.
    relurllist = getrelurl(absurllist[0], page_source)
    return absurllist + relurllist
def saveemail():
    """Drain the global email queue into mail.txt every 5 seconds, forever.

    Runs as a background thread. Appends one address per line and
    flushes after each batch so results survive an abrupt shutdown.
    The original embedded a raw line break inside the string literal
    (a syntax error; "\\n" was intended) and its file.close() was
    unreachable after the infinite loop.
    """
    global email_queue
    # `with` guarantees the handle is released if the thread ever dies.
    with open("mail.txt", "ab") as outfile:
        while True:
            time.sleep(5)
            while not email_queue.empty():
                email = email_queue.get()
                outfile.write((email + "\n").encode("utf-8", "ignore"))
            outfile.flush()  # persist the batch immediately
def BFS(url, email_queue, url_queue):
    """Process one URL from the frontier: harvest emails, enqueue new links.

    The *url* parameter is kept for interface compatibility with existing
    callers; the URL actually crawled is taken from *url_queue*.
    """
    try:
        # A bounded get instead of a blocking one: without a timeout,
        # threads started while the queue is momentarily empty would
        # block forever and leak.
        url = url_queue.get(timeout=10)
    except queue.Empty:
        return
    page_source = getpagesource(url)
    for email in getemaillist(page_source):
        email_queue.put(email)
        print(email)
    # Push every link found on the page back onto the frontier.
    for link in getallurl(page_source):
        url_queue.put(link)
def executeBFS(url, email_queue, url_queue):
    """Seed the crawl with *url* and keep dispatching BFS worker threads.

    Runs forever. The original held the semaphore around the dispatch
    loop itself — limiting nothing — while `range(101)` spawned 101
    threads per iteration without bound; here each worker acquires the
    semaphore, so at most `sem`'s count of BFS workers run concurrently.
    """
    url_queue.put(url)  # seed the frontier
    global sem

    def _worker():
        # Each worker holds one semaphore slot for its whole lifetime,
        # which is what actually caps concurrency.
        with sem:
            BFS(url, email_queue, url_queue)

    while True:
        if url_queue.empty():
            time.sleep(0.1)  # avoid a hot busy-wait while workers refill
            continue
        threading.Thread(target=_worker, daemon=True).start()
if __name__ == "__main__":
    email_queue = queue.Queue()  # harvested email addresses
    url_queue = queue.Queue()    # crawl frontier
    # Caps concurrent crawler threads at 20 (the original comment
    # claimed 100, contradicting the actual Semaphore(20) value).
    sem = threading.Semaphore(20)
    # After 5 s, start the background thread that persists emails to disk.
    timerthd = threading.Timer(5, saveemail)
    timerthd.daemon = True  # don't let the saver thread keep the process alive
    timerthd.start()
    url = "http://bbs.tianya.cn/post-140-393974-1.shtml"
    executeBFS(url, email_queue, url_queue)