• 爬虫初探-笔趣阁小说下载


    # -*- coding: utf-8 -*-
    """
    Created on Tue Dec  1 12:31:07 2020
    
    @author: zhaolulu
    """
    import pandas as pd
    import requests
    from lxml import etree
    
    
    # Default HTTP request headers: present a desktop Chrome User-Agent string.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/86.0.4240.75 Safari/537.36"
        ),
    }
    
    def url_read(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        The request is sent with the module-level ``headers`` dict.

        Args:
            url: Address of the page to download.

        Returns:
            The response body as a ``str``.

        Raises:
            requests.RequestException: If the HTTP request itself fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        try:
            response = requests.get(url, headers=headers)
        except requests.RequestException:
            # A bare ``except`` used to print and then fall through to the
            # ``return``, where the response variable was never assigned, so
            # the caller saw an unrelated UnboundLocalError. Keep the message
            # but propagate the real network error instead.
            print('failed')
            raise
        return response.content.decode('utf-8')
    
    if __name__ == '__main__':
        # Imported here so this script block carries its own dependency.
        import os

        # Home page of the xbiquge novel site.
        url = 'http://www.xbiquge.la/'
        text = url_read(url)
        print("============================================")
        selector = etree.HTML(text)
        # Links to the latest novels listed on the home page.
        ret = selector.xpath('//*[@id="newscontent"]/div[1]/ul/li/span[2]/a//@href')
        for note_url in ret:
            print(note_url)

        # A single novel URL is hard-coded here for testing
        # (the original note says it was ret[0]).
        n_text = url_read('http://www.xbiquge.la/62/62585/')
        n_html = etree.HTML(n_text)
        xpath_ret = n_html.xpath('//*[@id="list"]/dl/dd/a/@href')

        # The previous literal '..\book\' ended with an escaped quote, which
        # left the string unterminated (a SyntaxError), and '\b' is a backspace
        # escape. Build the path portably and make sure the directory exists
        # before writing into it.
        out_dir = os.path.join('..', 'book')
        os.makedirs(out_dir, exist_ok=True)

        # Counts only the chapters that were actually written, so output files
        # are numbered consecutively.
        index = 0
        for t_url in xpath_ret:
            # Chapter hrefs are appended to the site root to get the full URL.
            f_url = 'http://www.xbiquge.la' + t_url
            print(f_url)
            article = url_read(f_url)
            article_text = etree.HTML(article)
            # Text nodes directly under the element with id="content".
            article_detail = article_text.xpath('//*[@id="content"]/text()')
            if article_detail:
                pd.Series(article_detail).to_csv(os.path.join(out_dir, str(index)))
                index += 1
    
    金钱和自由相比,不值一提
  • 相关阅读:
    java 多线程踩过的坑
    css transform旋转属性
    java 实现JSON数据格式化
    shell if判断
    Shell脚本变量判断参数命令
    CentOS7 yum方式安装mysql5.7客户端安装
    sed命令删除指定行
    awk查询文件最长或者最短行
    Ansible离线安装
    gitlab的仓库迁移到新的gitlab
  • 原文地址:https://www.cnblogs.com/roadzhao/p/14077774.html
Copyright © 2020-2023  润新知