A Python 3 web-scraping exercise: it crawls the text-version Xinwen Lianbo (CCTV News) website and appends each broadcast's title and transcript to a local txt file.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
__author__ = 'wyf349'
__mtime__ = '2019/12/20'
'''
'''First version: fetch the article links and their content, and store them in a txt file'''

# import the required libraries
import requests
from bs4 import BeautifulSoup
import io
import sys
import re
import os
import time


def getRespose(url):
    '''Fetch the page with requests and return the response text'''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print('Request failed: ' + url)
        return False


def getNowUrls(url, mode=1):
    '''Parse a list page: mode 1 returns the article links, mode 2 returns the next-page link'''
    URL_all_set = set()
    URL_next_page_set = set()
    soup = BeautifulSoup(getRespose(url), 'html.parser')
    if mode == 1:
        try:
            for line in soup.body.find(class_='xwlist').find_all(name='a'):
                url_point = line.attrs['href']
                # print(url_point)
                # print(URL_all)
                if url_point not in URL_all_set:
                    URL_all_set.add(url_point)
            return URL_all_set
        except:
            print('Failed to collect article urls from the list page')
            return False
    else:
        try:
            url_next = soup.body.find(class_='page now-page').next_sibling.next_sibling.attrs['href']
            if url_next not in URL_next_page_set:
                URL_next_page_set.add(url_next)
                return URL_next_page_set
            else:
                print('Link already collected: ' + url_next)
                return False
        except:
            print('Failed to get the next-page url')
            return False


def gettext(url):
    '''Parse one article page and return its title and body text'''
    try:
        demo = getRespose(url)
        soup_text = BeautifulSoup(demo, 'html.parser')
        Text_title = soup_text.head.title.string
        Text_text = soup_text.body.find(attrs={'class': 'text_content'}).p.string
        return Text_title, Text_text
    except:
        print('Failed to parse the news page!')
        return False


def TextWriter(url, file_path=r'.\temp', file_name=r'新闻联播.txt'):
    '''Append one article's title and text to the output file'''
    file_all = os.path.join(file_path, file_name)
    Get_text_list = gettext(url)  # fetch and parse the article once, then reuse the (title, text) tuple
    if Get_text_list:
        if not os.path.exists(file_path):  # check whether the output directory exists
            os.mkdir(file_path)  # create it if it does not
        try:
            with open(file_all, 'a+', encoding='utf8') as f:
                f.write(Get_text_list[0] + '\n')
                f.write(str(Get_text_list[1]) + '\n')  # the body may not be a plain string, so convert it to str before writing
                f.flush()  # flush the buffer to disk
            print('File written successfully')
        except:
            print('Failed to write the file')
            return False


def main(url):
    URL_all = getNowUrls(url, 1)        # article links on the current list page
    URL_next_page = getNowUrls(url, 2)  # link to the next list page
    for url_line in list(URL_all):
        TextWriter(url_line, file_path=r'.\temp', file_name=r'新闻联播.txt')
        URL_all.remove(url_line)
    # print('Collected list:', URL_all)
    # print('Next page:', URL_next_page)
    if len(URL_all) == 0 and len(URL_next_page) == 1:
        Next_url = list(URL_next_page)[0]
        URL_next_page.remove(Next_url)
        time.sleep(1)  # pause briefly, then move on to the next list page
        main(Next_url)


if __name__ == '__main__':
    url = r'http://www.xwlb.top/xwlb.html'
    main(url)
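One design note: main() recurses into itself for every archive page, so a very long page chain adds one stack frame per page and could eventually hit Python's default recursion limit. Below is a minimal sketch of the same crawl written as a loop instead. It reuses getNowUrls and TextWriter from the script above; the function name crawl_all and the delay parameter are illustrative additions, not part of the original code.

import time

def crawl_all(start_url, delay=1):
    '''Iterative alternative to main(): walk the list pages one by one instead of recursing.'''
    url = start_url
    while url:
        article_urls = getNowUrls(url, 1) or set()  # fall back to an empty set if the list page could not be parsed
        for article_url in article_urls:
            TextWriter(article_url)
        next_page = getNowUrls(url, 2)  # a one-element set, or False when there is no next page
        url = list(next_page)[0] if next_page else None
        time.sleep(delay)  # be polite to the server between pages

# Example usage:
# crawl_all(r'http://www.xwlb.top/xwlb.html')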