• 内涵段子——脑筋急转弯——spider


    # python 3.7
    from urllib.request import Request,urlopen
    import re,time
    
    class Neihan(object):
        """Scraper for the riddle listing pages under https://www.neihan8.com/njjzw/.

        Each listing page is downloaded, (question, answer) pairs are pulled out
        with a regular expression, and the pairs are appended to a text file.
        """

        # File that scraped entries are appended to (UTF-8, append mode).
        output_path = '11.txt'

        # One listing entry: the link text inside the title <h3> is the question,
        # the following <div class="desc"> holds the answer text.  ``\s+`` spans
        # the whitespace/newlines between the two tags.
        _ENTRY_RE = re.compile(
            r'class="title" title=".*?">(.*?)</a></h3>\s+<div class="desc">(.*?)</div>'
        )

        def __init__(self):
            # Browser-like request headers.  Header names must not contain
            # spaces, otherwise the server does not recognise them.
            self.header = {
                'Host': 'www.neihan8.com',
                'Referer': 'https://www.neihan8.com/njjzw/',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
                "Cookie": 'UM_distinctid=1673e837ae7146-0363c5477e0b8a-424f0928-13c680-1673e837ae9355; CNZZDATA1274349754=965294396-1542939999-%7C1542939999; Hm_lvt_94f4eb93f17efa632a5c8a01b23da410=1542942067; npreuecookieclassrecord=%2C2%2C14%2C1%2C; CNZZDATA5804950=cnzz_eid%3D222162018-1542942068-https%253A%252F%252Fwww.neihan8.com%252F%26ntime%3D1542942068; Hm_lpvt_94f4eb93f17efa632a5c8a01b23da410=1542943190',
            }
            # Base URL of the listing; page 1 lives here, later pages are
            # ``index_<n>.html`` below it.
            self.static = 'https://www.neihan8.com/njjzw/'

        @staticmethod
        def _extract(htmlres):
            """Return the list of ``(question, answer)`` tuples found in *htmlres*."""
            return Neihan._ENTRY_RE.findall(htmlres)

        @staticmethod
        def _format_entry(question, answer, prefix=None):
            """Render one entry as the text block written to the output file.

            *prefix* (e.g. ``'答案:'``) is put in front of the stripped answer
            when it is not ``None``.  The block ends with an empty line.
            """
            lead = '' if prefix is None else prefix
            return '问题:' + question + '\n' + lead + answer.strip() + '\n\n'

        def getPage(self, url, refer=None):
            """Download *url* and hand the decoded HTML to :meth:`parsePage`.

            :param url: page to fetch.
            :param refer: optional answer prefix forwarded to the writer.
            :raises urllib.error.URLError: propagated from ``urlopen``.
            """
            request = Request(url=url, headers=self.header)
            # Context manager closes the HTTP response once the body is read.
            with urlopen(request) as response:
                raw = response.read()
            self.parsePage(raw.decode(), refer)

        def parsePage(self, htmlres, *args):
            """Extract entries from *htmlres* and pass them to :meth:`writePge`.

            The extra positional arguments are forwarded as one tuple, which is
            the calling convention :meth:`writePge` reads its prefix from.
            """
            self.writePge(self._extract(htmlres), args)

        def writePge(self, p, *args):
            """Append the entries in *p* to :attr:`output_path`.

            :param p: iterable of ``(question, answer)`` tuples.
            :param args: optionally one sequence whose first element is the
                answer prefix (``None`` for no prefix).  Calling without it is
                treated as "no prefix" instead of raising ``IndexError``.
            """
            prefix = args[0][0] if args and args[0] else None
            with open(self.output_path, 'a', encoding='utf8') as f:
                f.writelines(
                    self._format_entry(question, answer, prefix)
                    for question, answer in p
                )

        def workon(self, pages=9):
            """Crawl listing pages ``1..pages`` (9 by default), pausing between them.

            :param pages: number of listing pages to fetch.
            """
            for page in range(1, pages + 1):
                if page == 1:
                    # NOTE(review): only the first page is written with the
                    # '答案:' prefix -- presumably later pages already carry it
                    # in their description text; confirm against the site.
                    self.getPage(self.static, refer='答案:')
                else:
                    self.getPage(self.static + 'index_%s.html' % page)

                # Be polite to the server between requests.
                time.sleep(2)
    
    # Script entry point: build the spider and run the crawl when this file is
    # executed directly (not when imported).
    if __name__ == '__main__':
        Neihan().workon()
  • 相关阅读:
    How a webpage is loaded and displayed
    Tree知识总结
    Install Cassandra Locally
    axios接口封装
    Jsonp解决跨域问题
    react使用swiper,解决添加点击事件首位图片点击失效,解决轮播按钮被覆盖问题
    vue 生产环境和测试环境的配置
    vue使用远程在线更新代码
    vue.js axios实现跨域http请求接口
    leetcode每日一题(2020-05-27):974. 和可被 K 整除的子数组
  • 原文地址:https://www.cnblogs.com/Skyda/p/10006672.html
Copyright © 2020-2023  润新知