• Crawling Email Addresses with Python


    The following code implements crawling email addresses from web pages in Python.

    """ 网络爬虫爬邮箱 """
    
    from bs4 import BeautifulSoup
    import requests
    import requests.exceptions
    from urllib.parse import urlsplit
    from collections import deque
    import re
    import os
    import csv
    
    class EmailCrawler:
        """ 邮箱爬虫 """
    
        # Regular expression for matching email addresses
        __email_addr_pattern = r"[a-z0-9._+-]+@[a-z0-9._+-]+\.[a-z]+"
    
        def crawl(self, urls):
            """ 
            爬取
            
    参数: urls - 网址列表或者文件(.txt,.csv)
             """
            new_urls = deque()      # queue of URLs still to crawl
            processed_urls = set()  # URLs already crawled
            emails = set()          # collected email addresses
    
            if type(urls) is deque:
                new_urls = urls
            elif type(urls) is list:
                new_urls = deque(urls)
            elif type(urls) is str:
                data = list()
                if os.path.exists(urls):
                    data = self.__readCSVData(urls)
                else:
                    data = urls.split(',')
                new_urls = deque(data)
            else:
                print("不支持的参数!")
                return emails        
    
            """ 开始爬取 """
            # 遍历网址直到结束
            while len(new_urls):
                # Pop the next URL from the front of the queue
                url = new_urls.popleft()
                processed_urls.add(url)
    
                # Extract the base URL and path so that relative links can be resolved
                parts = urlsplit(url)
                base_url = "{0.scheme}://{0.netloc}".format(parts)
                path = url[:url.rfind('/')+1] if '/' in parts.path else url
    
                # Fetch the page content
                print("Processing %s" % url)
                try:
                    response = requests.get(url)
                except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
                    # Ignore pages that cannot be fetched
                    continue
    
                # Extract all email addresses on the page and add them to the result set
                new_emails = set(re.findall(self.__email_addr_pattern, response.text, re.I))
                if len(new_emails) > 0:
                    emails.update(new_emails)
                    print(new_emails)
    
                # Build a BeautifulSoup document from the page
                soup = BeautifulSoup(response.text, features="lxml")
    
                # Find and process every anchor in the document
                for anchor in soup.find_all('a'):
                    # Extract the link from the anchor
                    link = anchor.attrs['href'] if 'href' in anchor.attrs else ''
                    # Resolve internal (relative) links
                    if link.startswith('/'):
                        link = base_url + link
                    elif not link.startswith('http'):
                        link = path + link
            
                    # Enqueue the link if it has not been queued or crawled yet
                    if link not in new_urls and link not in processed_urls:
                        new_urls.append(link)

            return emails
    
        def __readCSVData(self, filename):
            """ 读取文件 """
            data = list()
            with open(filename, 'r') as f:
                f_csv = csv.reader(f)
                for row in f_csv:
                    data.append(row[0])
            return data


    if __name__ == '__main__':
        # urls = 'http://www.themoscowtimes.com'
        # urls = ['http://www.themoscowtimes.com']
        urls = 'urls.txt'
        emailCrawl = EmailCrawler()
        emails = emailCrawl.crawl(urls)

    The code above is adapted from: http://scraping.pro/simple-email-crawler-python/
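
    For reference, the snippet below shows what the address pattern used by EmailCrawler extracts from a piece of text; the sample string and the addresses in it are purely illustrative:

    import re

    # Same pattern as EmailCrawler.__email_addr_pattern above
    pattern = r"[a-z0-9._+-]+@[a-z0-9._+-]+\.[a-z]+"

    sample = "Contact us at Info@Example.com or first.last+tag@mail.example.org."
    print(re.findall(pattern, sample, re.I))
    # ['Info@Example.com', 'first.last+tag@mail.example.org']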

    There is also an open-source project for extracting email addresses from text: https://pypi.org/project/email-scraper/
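
    If that package exposes a scrape_emails function as its PyPI description suggests, a minimal usage sketch might look like the following (the import path and function name are assumptions to verify against the installed version):

    # Hedged sketch of using the email-scraper package (pip install email-scraper).
    # The scrape_emails name is taken from the project's PyPI description and should
    # be verified against the installed version before relying on it.
    from email_scraper import scrape_emails

    text = "Reach us at support@example.com or sales@example.org."
    print(scrape_emails(text))  # expected: a set of the addresses found in the text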

  • Original article: https://www.cnblogs.com/ziyu-trip/p/12672838.html