• Python learning: Xinwen Lianbo (新闻联播) text-transcript crawler (v1.0)


    A Python 3 crawling exercise that scrapes the text-transcript website of Xinwen Lianbo (CCTV's nightly news broadcast).
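
    The script relies on two third-party packages, requests and beautifulsoup4. Its flow, as implemented below: fetch a listing page, collect the article links from it, write each article's title and text to a local txt file, then follow the next-page link and repeat.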

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    '''
    __author__ = 'wyf349'
    __mtime__ = '2019/12/20'
    '''
    '''First version: collects the article links and content and stores them in a txt file'''
    
    
    # import the required libraries
    import requests
    from bs4 import BeautifulSoup
    import io
    import sys
    import re
    import os
    import time
    
    
    def getRespose(url):
        '''Fetch a URL with requests and return the response text'''
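        # send a desktop Chrome User-Agent so the request looks like an ordinary browser visit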
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
        try:
            r = requests.get(url, headers=headers, timeout=30)
            r.raise_for_status()
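            # let requests guess the charset from the page body to avoid garbled Chinese text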
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            print('Request failed: ' + url)
            return False
    
    
    def getNowUrls(url,mode=1):
        '''Parse the article links (mode 1) or the next-page link (mode 2) from a listing page'''
        URL_all_set = set()
        URL_next_page_set = set()
        html = getRespose(url)
        if not html:
            return False
        soup = BeautifulSoup(html, 'html.parser')
        if mode == 1 :
            try:
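                # the element with class 'xwlist' contains the article links on the listing page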
                for line in soup.body.find(class_='xwlist').find_all(name = 'a'):
                    url_point = line.attrs['href']
                    #print(url_point)
                    #print(URL_all)
                    if url_point not in URL_all_set:
                        URL_all_set.add(url_point)
                return URL_all_set
            except:
                print('Failed to get the article URLs from the listing page')
                return False
        else:
            try:
                # two next_sibling hops skip the whitespace text node after the
                # current-page marker (class 'page now-page') and land on the next-page link
                url_next = soup.body.find(class_='page now-page').next_sibling.next_sibling.attrs['href']
                if url_next not in URL_next_page_set:
                    URL_next_page_set.add(url_next)
                    return URL_next_page_set
                else:
                    print('Link ' + url_next + ' already exists!')
                    return False
            except:
                print('Failed to get the next-page URL')
                return False
    
    
    def gettext(url):
        try:
            demo = getRespose(url)
            if not demo:
                return False
            soup_text = BeautifulSoup(demo, 'html.parser')
    
            Text_title = soup_text.head.title.string
            Text_text = soup_text.body.find(attrs={'class':'text_content'}).p.string
            return Text_title, Text_text
        except:
            print('Failed to parse the article page!')
            return False
    
    
    def TextWriter(url, file_path=r'.\temp', file_name=r'新闻联播.txt'):
        file_all = os.path.join(file_path, file_name)
        Get_text_list = gettext(url)
        if not Get_text_list:  # skip this article if the page could not be parsed
            return False
        if not os.path.exists(file_path):  # check whether the output directory exists
            os.mkdir(file_path)  # create it if it does not
        try:
            with open(file_all, 'a+', encoding="utf8") as f:
                f.write(Get_text_list[0] + '\n')
                f.write(str(Get_text_list[1]) + '\n')  # the text may be a list, so convert it to str before writing
                f.flush()  # flush the buffer to disk
                print('File written successfully')
        except:
            print('Failed to write the file')
            return False
    
    def main(url):
        URL_all = getNowUrls(url, 1)
        URL_next_page = getNowUrls(url, 2)
        if not URL_all:  # nothing to crawl if the listing page could not be parsed
            return
        for url_line in list(URL_all):
            TextWriter(url_line, file_path=r'.\temp', file_name=r'新闻联播.txt')
            URL_all.remove(url_line)
            # print('articles left on this page:', URL_all)
            # print('next page:', URL_next_page)
            if len(URL_all) == 0 and URL_next_page and len(URL_next_page) == 1:
                Next_url = list(URL_next_page)[0]
                URL_next_page.remove(Next_url)
                time.sleep(1)  # pause briefly before requesting the next listing page
                main(Next_url)
    
    
    
    if __name__ == '__main__':
        url = r'http://www.xwlb.top/xwlb.html'
        main(url)
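
    To try the helpers on their own (for example from an interactive session after importing the module), something like the sketch below works; it assumes the listing URL above is still reachable and that the page layout has not changed:

    article_urls = getNowUrls(r'http://www.xwlb.top/xwlb.html', 1)   # set of article links, or False
    if article_urls:
        print(len(article_urls), 'article links found')
        sample = list(article_urls)[0]       # take one article link to inspect
        result = gettext(sample)             # (title, text) on success, False on failure
        if result:
            print(result[0])                 # show the article title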
    

      

  • Original post: https://www.cnblogs.com/wyf-349/p/12083981.html