• python 爬虫入门


    import requests
    import re
    
    # Download one novel from jingcaiyuedu.com and save it as a single UTF-8
    # text file named after the novel's title. Each chapter is written as its
    # title line followed by the cleaned chapter text.
    #
    # TODO: download the index-page URL of every novel
    # TODO: outer loop over all novels

    # 1. Download the novel's index page.
    novel_url = 'http://www.jingcaiyuedu.com/book/15205/list.html'
    response = requests.get(novel_url)
    # Set the character encoding explicitly before decoding the body.
    response.encoding = 'utf-8'
    html = response.text

    # 2. Extract the title and the chapter links (non-greedy matching).
    title = re.findall(r'<meta name="keywords" content="《(.*?)》', html)[0]
    # The page is expected to contain two <dl id="list"> blocks; the second one
    # is used. re.S lets '.' cross newlines so a block that spans several
    # lines of markup is still matched as a whole.
    dl = re.findall(r'<dl id="list">.*?</dl>', html, re.S)[1]
    # Each entry is a (href, link text) tuple.
    chapter_info_list = re.findall(r'<a.*?href="(.*?)".*?>(.*?)</a>', dl)

    # The chapter body sits between two inline script markers. The parentheses
    # must be escaped: an unescaped "()" is an empty regex group and would never
    # match the literal "a1();" text. Compiled once, outside the loop.
    content_pattern = re.compile(
        r'<script>a1\(\);</script>(.*?)<script>a2\(\);</script>', re.S)

    # 3. Visit every chapter, extract its content and persist it to the txt
    #    file. The with-block guarantees the file is flushed and closed even if
    #    a request or a match fails part-way through.
    with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
        for chapter_info in chapter_info_list:
            chapter_url, chapter_title = chapter_info
            # Turn a site-relative URL into an absolute one.
            if 'http' not in chapter_url:
                chapter_url = 'http://www.jingcaiyuedu.com%s' % chapter_url

            # Download the chapter page.
            chapter_response = requests.get(chapter_url)
            chapter_response.encoding = "utf-8"
            chapter_html = chapter_response.text

            # Extract the chapter text; skip a chapter whose page does not
            # contain the markers instead of aborting the whole download.
            content_match = content_pattern.search(chapter_html)
            if content_match is None:
                print('no content found, skipped: %s' % chapter_url)
                continue
            chapter_content = content_match.group(1)

            # Clean the data: strip spaces and leftover HTML line breaks/entities.
            chapter_content = chapter_content.replace(' ', '')
            chapter_content = chapter_content.replace('<br/>', '')
            chapter_content = chapter_content.replace('<br>', '')
            chapter_content = chapter_content.replace('&nbsp;', '')

            # Write the chapter: title line, then the content line.
            fb.write(chapter_title)
            fb.write('\n')
            fb.write(chapter_content)
            fb.write('\n')

            # Progress output.
            print(chapter_url)
  • 相关阅读:
    c#大文件上传解决方案支持分片断点上传
    css精灵动画
    文字游戏
    利用myeclipse配置数据库连接池
    python 简单的txt文件读写
    数据库连接池配置
    hdu 1030 Delta-wave
    java jdbc sqlhelper
    js实现页面的自动读秒跳转
    购物车模块
  • 原文地址:https://www.cnblogs.com/stono/p/8861710.html
Copyright © 2020-2023  润新知