• 小说爬取 python + urllib + lxml


    from urllib import parse
    from urllib import request
    from lxml import etree
    import time
    
    class Novel:
        """A downloaded novel: a file base name plus the assembled chapter text.

        Positional arguments:
            args[0]: base name of the output file (``.txt`` is appended on write).
            args[1]: mapping of chapter key -> chapter text.

        Attributes:
            name: the base name passed in.
            dict: the chapter mapping passed in.
            txt: every chapter text concatenated in chapter order.
        """

        def __init__(self,*args):
            self.name = args[0]
            self.dict = args[1]
            # Keys are usually numeric strings ('0', '1', ..., '10'); a plain
            # sorted() would order them lexicographically ('10' before '2').
            ordered_keys = sorted(self.dict, key=self._chapter_order)
            self.txt = ''.join(self.dict[key] for key in ordered_keys)

        @staticmethod
        def _chapter_order(key):
            """Sort key: numeric keys first in numeric order, then the rest as text."""
            try:
                return (0, int(key), '')
            except (TypeError, ValueError):
                return (1, 0, str(key))

        def write(self):
            """Write the assembled text to ``<name>.txt``, replacing any existing file."""
            # Explicit UTF-8 so that characters outside the platform's default
            # codec (for example '\xa0' under GBK) do not abort the write; the
            # context manager closes the file even if write() raises.
            with open(self.name+'.txt','w',encoding='utf-8') as f:
                f.write(self.txt)
    
    # Fetch the source of a web page
    def get_http_page(url,**kw):
        """Download *url* and return the response body decoded to ``str``.

        Keyword arguments:
            encoding: codec used to decode the body; defaults to ``'gbk'``.
                Any other keyword is ignored.

        Errors raised by ``urlopen`` and ``UnicodeDecodeError`` from decoding
        are not caught here and propagate to the caller.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }
        req = request.Request(url,headers=headers)
        # The context manager closes the response even if read() raises.
        with request.urlopen(req) as response:
            page = response.read()
        # .get() keeps the 'gbk' default when other keywords are supplied
        # without 'encoding' (indexing kw['encoding'] raised KeyError).
        encoding = kw.get('encoding', 'gbk')
        return page.decode(encoding)
    
    # Fetch the novel's directory (table-of-contents) page URLs
    def get_comics_directory(url):
        """Return the directory-page URLs of the first search hit at *url*.

        The search result page is fetched as UTF-8 and the first matching
        result link is followed. Every ``<option>`` value of the select
        element found on that page is turned into an absolute URL on
        ``https://m.wenxuemi6.com``.

        Returns:
            A list of URL strings; empty when the search has no hit, the hit
            has no ``href``, or the select element is not found.
        """
        url_list = []
        page = get_http_page(url,encoding='utf-8')
        html = etree.HTML(page)
        result = html.xpath('/html/body/div[2]/div/div[2]/h3/a')
        # No search hit: return early instead of reading an unbound url2.
        if not result:
            return url_list
        url2 = result[0].get('href')
        if not url2:
            return url_list
        # urljoin leaves an absolute href untouched and resolves a relative
        # one against the search URL.
        page = get_http_page(parse.urljoin(url, url2))
        html = etree.HTML(page)
        elment_select = html.xpath('/html/body/div[4]/div[9]/span[2]/select')
        if elment_select:
            for option in elment_select[0].findall('option'):
                url_list.append('https://m.wenxuemi6.com{}'.format(option.get('value')))
        return url_list
    
    def downdload_txt(url_list,**kw):
        """Download the chapter text reachable from a range of directory pages.

        Args:
            url_list: directory-page URLs, indexed from 0.

        Keyword arguments:
            start: index of the first directory page to process (default 0).
            stop: index one past the last page to process (default
                ``len(url_list)``). Both bounds are converted with ``int`` and
                clamped to the valid range, so an out-of-range request yields
                fewer pages (possibly none) instead of failing.

        Returns:
            A dict mapping the directory-page index, as a string, to the
            concatenated text of every chapter linked from that page. Pages
            without chapter links are omitted.
        """
        site = 'https://m.wenxuemi6.com{}'
        total = len(url_list)
        start = max(int(kw.get('start', 0)), 0)
        stop = min(int(kw.get('stop', total)), total)
        print('正在爬取目录和章节地址,请稍等……')
        d = {}
        # range() advances the page index; the earlier while loop never
        # incremented its counter and therefore never terminated.
        for count in range(start, stop):
            directory = etree.HTML(get_http_page(url_list[count]))
            parts = []
            for link in directory.xpath('/html/body/div[4]/ul[2]/li/a'):
                href = link.get('href')
                if not href:
                    continue
                url = site.format(href)
                print('Download chapters by URL:{}'.format(url))
                chapter = etree.HTML(get_http_page(url))
                titles = chapter.xpath('//*[@id="nr_title"]/text()')
                # The title is used only when exactly one text node is found.
                parts.append(titles[0] if len(titles) == 1 else '')
                parts.append('\n')
                for line in chapter.xpath('//*[@id="nr1"]/text()'):
                    parts.append(line + '\n')
                # Exactly one <p> inside the content node triggers a fetch of
                # the "_2" page -- presumably the marker for a chapter split
                # over two pages; verify against the site markup.
                if len(chapter.xpath('//*[@id="nr1"]/p')) == 1:
                    # href[:-5] drops the last five characters (assumed to be
                    # the '.html' suffix) before appending '_2.html'.
                    url = site.format(href[:-5] + '_2.html')
                    print('Download chapters by URL:{}'.format(url))
                    second = etree.HTML(get_http_page(url))
                    for line in second.xpath('//*[@id="nr1"]/text()'):
                        parts.append(line + '\n')
                # Pause between chapters to avoid hammering the server.
                time.sleep(1)
            if parts:
                d['{}'.format(count)] = ''.join(parts)
        return d
    
    
    
    if __name__ == '__main__':
        # Ask for a book title and build the site's search URL from it.
        txt_name = input("请输入要搜索的书名:")
        url = 'https://m.wenxuemi6.com/search.php?keyword={}'.format(parse.quote(txt_name))
        url_list = get_comics_directory(url)
        # Download only the chapters listed on the first directory page
        d = downdload_txt(url_list,start=0,stop=1)
        n1 = Novel(txt_name,d)
        # Write [txt_name].txt into the current directory
        n1.write()

        # Download the complete novel: without start/stop every directory page
        # is processed (previously this repeated start=0, stop=1).
        d2 = downdload_txt(url_list)
        n2 = Novel(txt_name,d2)
        # Write [txt_name].txt into the current directory, replacing the
        # first-page file written above
        n2.write()
  • 相关阅读:
    google jQuery 1.4.2引用文件,jQuery 1.4.2 引用地址,jQuery引用地址
    html input checkbox js,jQuery
    HTML <fieldset> 标签
    ul 水平,行内块放置,取消点点
    C# Xml 操作
    DropDownList 下拉菜单控件
    jQuery,js : missing)after argument list
    PHP会员权限设计
    主流ETL工具选型
    windows XP下MySQL Cluster集群安装配置 .
  • 原文地址:https://www.cnblogs.com/Dmail/p/11615049.html
Copyright © 2020-2023  润新知