• python mini-crawler: a first taste of scraping blog posts


    Lately I've been learning python to the point of obsession, so while the enthusiasm lasts I'm taking another beginner-level stab at a crawler. I wrote one in java before; this one is still the most basic kind of HTML fetching, with no HTML parser, regular expressions, etc. ... and since everything runs in a plain loop, efficiency is certainly very low (a small parsing sketch follows the two versions of the code below).

    import urllib.request as urllib2
    
    import random
    ua_list = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
        ]
    
    ua_agent = random.choice(ua_list)
    
    # ua_agent_dict = {'User-Agent':ua_agent}
    
    # print(ua_agent_dict)
    # request = urllib2.Request(url=url)
    # request.add_header(**ua_agent_dict)
    
    def checkPageExists(url,ua_agent):
        '''Return True if the page answers with HTTP 200, otherwise False.'''
        request = urllib2.Request(url=url)
        request.add_header('User-Agent',ua_agent)  # header name is 'User-Agent', not 'User_Agent'
        try:
            code = urllib2.urlopen(request).code
        except IOError:
            return False
        return code == 200
    checkPageExists('https://www.cnblogs.com/Frank99/p/91111024.html',ua_agent=ua_agent)
    url_prefix = 'https://www.cnblogs.com/Frank99/p/'
    url_subfix = '.html'
    # https://www.cnblogs.com/Frank99/p/
    def getHtml(url,ua_agent):
        request = urllib2.Request(url=url)
        request.add_header('User-Agent',ua_agent)
        print('Reading data from page {} ......'.format(url))
        response = urllib2.urlopen(request)
        print('Finished reading data from page {} ......'.format(url))
        return response.read()
    
    def write_html2file(html,file_name):
        with open(file_name,'w',encoding='utf-8') as f:
            print('Saving file {} ......'.format(file_name))
            f.write(html.decode())
            print('Finished saving file {} ......'.format(file_name))
    
    if __name__ == '__main__':
        list(map(lambda i:write_html2file(getHtml(url_prefix+str(i)+url_subfix,ua_agent=ua_agent),str(i)+url_subfix),[i for i in range(9111123,9111125) if checkPageExists(url_prefix+str(i)+url_subfix,ua_agent=ua_agent)]))
        
    # for i in range(9111123,9111125):
    #     url = url_prefix+str(i)+url_subfix
    #     file_name = str(i)+url_subfix
    #     if checkPageExists(url,ua_agent=ua_agent):
    #         html = getHtml(url,ua_agent=ua_agent)
    #         write_html2file(html,file_name)
    
    import urllib.request as urllib2
    
    import random
    ua_list = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
        ]
    
    ua_agent = random.choice(ua_list)
    
    # ua_agent_dict = {'User-Agent':ua_agent}
    
    # print(ua_agent_dict)
    # request = urllib2.Request(url=url)
    # request.add_header(**ua_agent_dict)
    
    def checkPageExists(url,ua_agent):
        '''Return True if the page answers with HTTP 200, otherwise False.'''
        request = urllib2.Request(url=url)
        request.add_header('User-Agent',ua_agent)  # header name is 'User-Agent', not 'User_Agent'
        try:
            code = urllib2.urlopen(request).code
        except IOError:
            return False
        return code == 200
    # checkPageExists('https://www.cnblogs.com/Frank99/p/91111024.html',ua_agent=ua_agent)
    
    # https://www.cnblogs.com/Frank99/p/
    def getHtml(url,ua_agent):
        request = urllib2.Request(url=url)
        request.add_header('User-Agent',ua_agent)
        print('Reading data from page {} ......'.format(url))
        response = urllib2.urlopen(request)
        print('Finished reading data from page {} ......'.format(url))
        return response.read()
    
    def write_html2file(html,file_name):
        with open(file_name,'w',encoding='utf-8') as f:
            print('Saving file {} ......'.format(file_name))
            f.write(html.decode())
            print('Finished saving file {} ......'.format(file_name))
    
    if __name__ == '__main__':
    #     url_prefix = 'https://www.cnblogs.com/Frank99/p/'
    #     url_subfix = '.html'
        url_prefix = input('Please enter the URL prefix of the resource to crawl...')
        url_subfix = input('Please enter the URL suffix of the resource to crawl...')
        list(map(lambda i:write_html2file(getHtml(url_prefix+str(i)+url_subfix,ua_agent=ua_agent),str(i)+url_subfix),(i for i in range(5400017,9111125) if checkPageExists(url_prefix+str(i)+url_subfix,ua_agent=ua_agent))))
        
    for i in range(9111123,9111125):
        url = url_prefix+str(i)+url_subfix
        file_name = str(i)+url_subfix
        if checkPageExists(url,ua_agent=ua_agent):
            html = getHtml(url,ua_agent=ua_agent)
            write_html2file(html,file_name)
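
    As mentioned in the intro, these versions only save raw HTML and do no parsing at all. As a minimal sketch of a possible next step (not part of the original script), here is how the page title could be pulled out of one of the saved files with the standard library's html.parser; the file name 9111123.html is just an example from the range crawled above:

    from html.parser import HTMLParser

    class TitleParser(HTMLParser):
        '''Collect the text inside the <title> tag of an HTML document.'''
        def __init__(self):
            super().__init__()
            self.in_title = False
            self.title = ''

        def handle_starttag(self, tag, attrs):
            if tag == 'title':
                self.in_title = True

        def handle_endtag(self, tag):
            if tag == 'title':
                self.in_title = False

        def handle_data(self, data):
            if self.in_title:
                self.title += data

    if __name__ == '__main__':
        parser = TitleParser()
        # 9111123.html is one of the files written by the loop above (assumed to exist)
        with open('9111123.html', encoding='utf-8') as f:
            parser.feed(f.read())
        print(parser.title)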
    

    Say whaaat.... (刚虾米)

    # https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=100
    # https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=10
    import urllib.request as urllib2
    from urllib import parse
    import random
    class TieBa(object):
        def __init__(self,**kw):
            for key in kw:
                if key == 'name':
                    self.__name = kw[key]
                elif key == 'start':
                    self.__start = kw[key]
                elif key == 'end':
                    self.__end = kw[key]
    #             elif key == 'url':
    #                 self.__url = kw[key]
        def set_name(self,name):
            self.__name = name
        def get_name(self):
            return self.__name
        def set_start(self,start):
            self.__start = start
        def get_start(self):
            return self.__start
        def set_end(self,end):
            self.__end = end
        def get_end(self):
            return self.__end
       
        def spider_html(self):
            '''
            Crawl the tieba pages from the start page to the end page.
            '''
            name = self.__name
            start = int(self.__start)
            end = int(self.__end)
            words = {'kw': name}
            name = parse.urlencode(words)
            url_prefix = r'https://tieba.baidu.com/f?'
            url_suffix = r'&ie=utf-8&pn='
            base_url = url_prefix + name + url_suffix
            for page in range(start, end):
                # rebuild the URL on every iteration; reusing url += str(page) would keep
                # appending page numbers to the same string (pn=12, pn=123, ...)
                url = base_url + str(page)
                print(url)
                html = self.__get_html(page, url)
                file_name = '{}-{}.html'.format(words['kw'], page)
                self.__write2file(file_name, html)
                
        def __get_html(self,page,url):
            ua_list = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
                        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
                        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
                        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"]
            request = urllib2.Request(url)
            request.add_header('User-Agent',random.choice(ua_list))
            response = urllib2.urlopen(request)
            print('Crawling page {} ...'.format(page))
            html = response.read()
            print('Finished crawling page {}'.format(page))
            return html
        
        def __write2file(self,file_name,html):
            print('Saving html to file...')
            with open(file_name,'w',encoding='utf-8') as f:
                f.write(html.decode())
                print('Saved html to file successfully...')
    if __name__ =='__main__':
        tb = TieBa()
        tb.set_name(input('Please enter the tieba name: '))
        tb.set_start(input('Please enter the start page: '))
        tb.set_end(input('Please enter the end page: '))
        tb.spider_html()
    
    

    Condition is used to synchronize threads; underneath it is still a Lock or an RLock (an RLock can be re-acquired multiple times within the same thread). A tiny RLock re-entrancy sketch follows the example below.

    from threading import (Thread,Condition)
    
    class XiaoAI(Thread):
        def __init__(self,cond,name='小爱'):
            super().__init__(name=name)
            self.cond = cond
    
        def run(self):
            with self.cond:
                self.cond.wait()
                print('{name}: I am here.'.format(name=self.name))
                self.cond.notify()
    
                self.cond.wait()
                print('{name}: Sure!'.format(name=self.name))
                self.cond.notify()
    class TianMao(Thread):
        def __init__(self,cond,name='天猫'):
            super().__init__(name=name)
            self.cond = cond
    
        def run(self):
            with self.cond:  # use the instance's condition rather than relying on the module-level cond
                print('{name}: Hey XiaoAI'.format(name=self.name))
                self.cond.notify()
                self.cond.wait()
                print('{name}: Let us trade lines of classical poetry.'.format(name=self.name))
                self.cond.notify()
                self.cond.wait()
    
    
    if __name__ == '__main__':
        cond = Condition()
        xiao = XiaoAI(cond)
        tian = TianMao(cond)
    
        xiao.start()  # the start order matters here: XiaoAI must already be waiting before TianMao speaks first
        tian.start()
        xiao.join()
        tian.join()
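
    On the side note above about RLock: a plain Lock would deadlock if the same thread tried to acquire it a second time, while an RLock simply counts nested acquisitions. A minimal sketch (not from the original post):

    from threading import Lock, RLock

    rlock = RLock()
    with rlock:
        with rlock:  # same thread re-enters: fine, the RLock keeps an ownership counter
            print('re-entered the RLock in the same thread')

    lock = Lock()
    lock.acquire()
    # lock.acquire()  # a plain Lock would block forever here (self-deadlock)
    lock.release()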
    
    from threading import (Thread,Semaphore)
    from urllib.parse import urlencode
    import requests
    import chardet
    import logging
    from os import path
    import random
    import re
    logging.basicConfig(level=logging.DEBUG)
    # https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=100
    
    class TieBaSpider(Thread):
        def __init__(self,url,sem,name='TieBaSpider'):
            super(TieBaSpider,self).__init__(name=name)
            self.url = url
            self.sem = sem
    
        def _save(self,text):
            parent_dir = r'D:\tieba'  # output directory; assumed to exist
            # strip characters that are awkward in file names from the URL and use its tail as the file name
            file_name = path.join(parent_dir, path.split(re.sub(r'[%=&?]', '', self.url))[1]) + '.html'
            with open(file_name,'w',encoding='utf-8') as fw:
                fw.write(text)
                fw.flush()
            return 1
    
    
        def run(self):
            # ua_list = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
            #            "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
            #            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
            #            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
            #            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"]
            # header = {'User-Agent':random.choice(ua_list)}
            response = requests.get(self.url)  # headers=header could be passed here if the UA list above is enabled
            content = response.content
            logging.info(response.encoding)
            # result = chardet.detect(content)
            # logging.info(result)
            # code = result.get('encoding','utf-8')
            self._save(content.decode(response.encoding))
            self.sem.release()
    
    class UrlProducer(Thread):
        def __init__(self,tb_name,sem,pages_once=3,start_index=1,end_index=9):# end-start % pages_once == 0
            super(UrlProducer,self).__init__(name=tb_name)
            self.tb_name = urlencode(tb_name)
            self.sem = sem
            logging.info(self.tb_name)
            self.pages_once = pages_once
            self.start_index = start_index
            self.end_index = end_index
    
        def run(self):
            for page_idx in range(self.start_index,self.end_index+1):
                self.sem.acquire()
                url_prefix = r'https://tieba.baidu.com/f?'
                url_suffix = r'&fr=ala0&tpl='
                self.url = url_prefix+self.tb_name+url_suffix+str(page_idx)
                tb_spider = TieBaSpider(self.url,self.sem)
                tb_spider.start()
    
    
    if __name__ == '__main__':
        kw_dict = dict(kw=r'国家地理')
        sem = Semaphore(3)  # allow at most 3 spider threads to run at once
        url_producer = UrlProducer(kw_dict,sem=sem)
        url_producer.start()
    
        url_producer.join()
    
    
    

    Free IP proxy pools

    站大爷 (a proxy service)
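
    A hedged sketch (not in the original post) of how a proxy from such a pool could be plugged into the requests-based spider above; the address 127.0.0.1:8888 is a placeholder, not a real free proxy:

    import requests

    # placeholder proxy address; a real one would come from a proxy pool such as the services above
    proxies = {
        'http': 'http://127.0.0.1:8888',
        'https': 'http://127.0.0.1:8888',
    }
    try:
        resp = requests.get('https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F',
                            proxies=proxies, timeout=10)
        print(resp.status_code)
    except requests.RequestException as err:
        print('proxy request failed:', err)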

    If there is a next life, I would like to travel alone, see different scenery, and feel the vitality of life...