• 1.7 Study Progress


    Studied for 2 hours today.

    Did hands-on web-scraping practice. The main difficulty came with XPath: an element could be found in the web page's source code, but not at the corresponding position in the parsed page the crawler saw.
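    One way to diagnose this (a minimal sketch; the URL and the debug.html filename are just placeholders): save exactly what requests received and search that file for the element. The browser's Elements panel shows the DOM after JavaScript has run, so an XPath that works in DevTools can still fail against resp.text when the element is rendered client-side.

    import requests

    # Dump the raw response and search it for the target element. If it is
    # absent here but visible in the browser's Elements panel, the element is
    # rendered by JavaScript after page load, and a plain GET will never see it.
    url = 'https://www.qiushibaike.com/text/'  # placeholder: the page being scraped
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    with open('debug.html', 'w', encoding='utf-8') as f:
        f.write(resp.text)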

    import re
    import requests

    url = 'https://www.qiushibaike.com/text/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62"
    }
    resp = requests.get(url, headers=headers)
    # print(resp.text)
    # Non-greedy capture with re.S, so jokes that span multiple lines are still matched.
    contents = re.findall(r'<div class="content">\s*<span>\s*(.+?)\s*</span>', resp.text, re.S)
    with open("duanzi.txt", 'a', encoding='utf-8') as f:
        for info in contents:
            f.write(info + "\n\n")
    Joke extraction
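    Regex extraction works here because the markup is flat and predictable, but it is brittle; the lxml/XPath versions further down are more robust to changes in the page structure.
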
    from fake_useragent import UserAgent
    import requests
    from lxml import etree
    from time import sleep

    def get_html(url):
        '''
        :param url: the URL to fetch
        :return: the page HTML, or None on a non-200 response
        '''
        headers = {
            "User-Agent": UserAgent().chrome  # the header name must be "User-Agent"
        }
        resp = requests.get(url, headers=headers)
        sleep(2)  # be polite between requests
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            return resp.text
        else:
            return None

    def parse_list(html):
        '''
        :param html: HTML of a movie-list page
        :return: a list of movie detail-page URLs
        '''
        e = etree.HTML(html)
        list_url = ['http://maoyan.com{}'.format(url) for url in e.xpath('//div[@class="movie-item film-channel"]/a/@href')]
        return list_url

    def parse_index(html):
        '''
        :param html: HTML of a movie detail page
        :return: the extracted movie info
        '''
        e = etree.HTML(html)
        # print(etree.tostring(e).decode())  # uncomment to inspect what was actually parsed
        name = e.xpath('//h1[@class="name"]/text()')
        movie_type = e.xpath('//li[@class="ellipsis"]/a/text()')
        actors = e.xpath('//ul[@class="celebrity-list clearfix"]/li/div/a/text()')
        actors = format_data(actors)  # was format(actors), which calls the builtin instead
        return {"name": name, "type": movie_type, "actors": actors}

    def format_data(actors):
        # Deduplicate actor names and strip surrounding whitespace.
        actor_set = set()
        for actor in actors:
            actor_set.add(actor.strip())
        return actor_set

    def main():
        num = int(input("How many pages to fetch? "))
        for page in range(num):
            url = 'http://maoyan.com/films?showType=3&offset={}'.format(page * 30)
            print(url)
            list_html = get_html(url)
            if list_html is None:
                continue
            list_url = parse_list(list_html)
            for url in list_url:
                print(url)
                info_html = get_html(url)
                if info_html is None:
                    continue
                movie = parse_index(info_html)
                print(movie)

    if __name__ == '__main__':
        main()
    Maoyan movie scraper
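    This script is likely where the XPath difficulty mentioned at the top shows up: the detail page served to requests may not match what the browser renders, which is why parse_index keeps the commented-out etree.tostring line for inspecting the tree the crawler actually parsed.
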
    from fake_useragent import UserAgent
    import requests
    from lxml import etree


    # Downloader: sends the request
    class Downloader:
        def do_download(self, url):
            print(url)
            headers = {
                'User-Agent': UserAgent().chrome
            }
            resp = requests.get(url, headers=headers)
            if resp.status_code == 200:
                resp.encoding = 'utf-8'
                return resp.text
            # Implicitly returns None on a non-200 response.


    # Parser: extracts the data
    class Parser:
        def do_parse(self, html):
            e = etree.HTML(html)
            contents = [div.xpath('string(.)').strip() for div in e.xpath('//div[@class="content"]')]
            urls = ['https://www.qiushibaike.com{}'.format(url) for url in e.xpath('//ul[@class="pagination"]/li/a/@href')]
            return contents, urls

    # Output: saves the data
    class DataOutPut:
        def do_save(self, datas):
            with open('duanzi3.txt', 'a', encoding='utf-8') as f:
                for data in datas:
                    f.write(data + '\n')

    # URL manager
    class URLManager:
        def __init__(self):
            self.new_url = set()
            self.old_url = set()

        # Add a single URL
        def add_new_url(self, url):
            if url is not None and url != '' and url not in self.old_url:
                self.new_url.add(url)

        # Add multiple URLs
        def add_new_urls(self, urls):
            for url in urls:
                self.add_new_url(url)

        # Take one URL and mark it as crawled
        def get_new_url(self):
            url = self.new_url.pop()
            self.old_url.add(url)
            return url

        # How many URLs are still waiting to be crawled
        def get_new_url_size(self):
            return len(self.new_url)

        # Whether any URLs remain to be crawled
        def is_have_new_url(self):
            return self.get_new_url_size() > 0


    # Scheduler
    class Scheduler:
        def __init__(self):
            self.downloader = Downloader()
            self.parser = Parser()
            self.data_out_put = DataOutPut()
            self.url_manager = URLManager()

        def start(self, url):
            self.url_manager.add_new_url(url)
            while self.url_manager.is_have_new_url():
                url = self.url_manager.get_new_url()
                html = self.downloader.do_download(url)
                if html is None:  # skip failed downloads instead of crashing the parser
                    continue
                datas, urls = self.parser.do_parse(html)
                self.data_out_put.do_save(datas)
                self.url_manager.add_new_urls(urls)

    if __name__ == '__main__':
        scheduler = Scheduler()
        url = 'https://www.qiushibaike.com/text'
        scheduler.start(url)
    Class-based version
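    In this layered design, URLManager's two sets ensure each page is downloaded at most once: a URL moves from new_url to old_url when it is handed out, and old URLs are never re-queued. The crawl therefore terminates on its own once the pagination links stop producing unseen URLs.
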
    from threading import Thread, Lock
    import requests
    from lxml import etree
    from fake_useragent import UserAgent
    from queue import Queue, Empty

    file_lock = Lock()  # serializes appends so lines from different threads don't interleave

    class Spider(Thread):
        def __init__(self, url_queue):
            Thread.__init__(self)
            self.url_queue = url_queue

        def run(self):
            while True:
                try:
                    # get_nowait() avoids the race where another thread empties
                    # the queue between an empty() check and a blocking get().
                    url = self.url_queue.get_nowait()
                except Empty:
                    break
                print(url)
                headers = {
                    'User-Agent': UserAgent().chrome  # the header name must be 'User-Agent'
                }
                resp = requests.get(url, headers=headers)
                e = etree.HTML(resp.text)
                contents = [div.xpath('string(.)').strip() for div in e.xpath('//div[@class="content"]')]
                # print(contents)
                with file_lock:
                    with open('duanzi2.txt', 'a', encoding='utf-8') as f:
                        for content in contents:
                            f.write(content + '\n')

    if __name__ == '__main__':
        base_url = 'https://www.qiushibaike.com/text/page/{}/'
        url_queue = Queue()
        for num in range(1, 6):
            url_queue.put(base_url.format(num))
        for num in range(3):
            spider = Spider(url_queue)
            spider.start()
    Multithreading
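    Queue is already thread-safe, so the three Spider threads can share one queue without extra locking; the explicit Lock exists only because file appends from several threads can otherwise interleave mid-line. A possible refinement (not shown) would be to push results onto a second queue and let a single writer thread own the file.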