"""Crawl the news.gzcc.cn campus-news listing and export the articles with pandas.

Running this file fetches the first listing page, works out how many listing
pages exist, crawls the remaining pages, then writes ``test.xlsx`` and
``test.csv`` and prints a few sample queries.
"""
import re
from datetime import datetime

import pandas
import requests
from bs4 import BeautifulSoup

# Seconds to wait for any single HTTP request before giving up; without a
# timeout a stalled connection would hang the whole crawl.
REQUEST_TIMEOUT = 30

# The article id is the file name of the detail page, after a "_xxxx/" segment.
_ARTICLE_ID_RE = re.compile(r'_(.*)/(.*)\.html')
# The counter endpoint answers with a JS snippet containing $('#hits').html('<n>').
_HITS_RE = re.compile(r"\$\('#hits'\)\.html\('(\d+)'\)")
# Total article count as shown in the pager text, e.g. "<n>条".
_NEWS_TOTAL_RE = re.compile(r'(\d+)条')

# Accumulates one dict per crawled article across all listing pages.
news_list = []


def _fetch_soup(url):
    """Download *url*, decode it as UTF-8 and return the parsed BeautifulSoup tree."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    return BeautifulSoup(response.text, 'html.parser')


def crawlOnePageSchoolNews(page_url):
    """Crawl one listing page and append every article on it to ``news_list``.

    For each ``.news-list > li`` entry the list-page summary is printed, the
    detail page is fetched through ``getNewDetail`` and the title is stored
    under the key ``'标题'``.

    Args:
        page_url: URL of the listing page.

    Returns:
        The module-level ``news_list`` (the same list object on every call).
    """
    page_soup = _fetch_soup(page_url)
    for item in page_soup.select('.news-list > li'):
        link = item.a.attrs['href']
        title = item.select('.news-list-title')[0].text
        info_spans = item.a.select('.news-list-info > span')
        print('**' * 5 + '列表页信息' + '**' * 10)
        print('新闻链接:' + link)
        print('新闻标题:' + title)
        print('新闻描述:' + item.a.select('.news-list-description')[0].text)
        print('新闻时间:' + info_spans[0].text)
        print('新闻来源:' + info_spans[1].text)
        # Use a separate name so the element list being iterated is not shadowed.
        detail = getNewDetail(link)
        detail['标题'] = title
        news_list.append(detail)
    return news_list


def getNewDetail(href):
    """Fetch an article detail page and return its parsed fields as a dict.

    Args:
        href: URL of the article detail page.

    Returns:
        A dict that always has ``'内容'`` (article text, ``''`` when the page
        has no ``#content`` element). When the page has a ``.show-info`` line
        it may also contain ``'来源'``, ``'作者'``, ``'审核'``, ``'摄影'``
        (strings), ``'发布时间'`` (``datetime``) and ``'点击'`` (``int``).

    Raises:
        ValueError: if the publish time does not match
            ``'%Y-%m-%d %H:%M:%S'``, or if the click count cannot be resolved.
    """
    print('**' * 5 + '详情页信息' + '**' * 10)
    print(href)
    detail_soup = _fetch_soup(href)
    news = {}

    content_nodes = detail_soup.select('#content')
    if content_nodes:
        news_content = content_nodes[0].text
        news['内容'] = news_content
        print(news_content)  # article body
    else:
        news['内容'] = ''

    info_nodes = detail_soup.select('.show-info')
    if not info_nodes:
        # Some older pages have no .show-info line at all.
        return news
    news_info = info_nodes[0].text

    info_list = ['来源', '发布时间', '点击', '作者', '审核', '摄影']  # fields to parse
    # &nbsp; in the page is decoded to U+00A0, which separates the fields.
    # dict.fromkeys removes duplicates while keeping the on-page order.
    pieces = [
        piece
        for piece in dict.fromkeys(news_info.split('\xa0'))
        if piece not in (' ', '')
    ]

    for piece in pieces:
        for info_flag in info_list:
            if info_flag not in piece:
                continue
            if info_flag == '发布时间':
                # The time field uses an ASCII colon, unlike the other fields.
                _, sep, raw_time = piece.partition(':')
                if not sep:
                    continue
                # Stored as datetime so it can go straight into a database later.
                release_time = datetime.strptime(raw_time, '%Y-%m-%d %H:%M:%S ')
                news[info_flag] = release_time
                print(info_flag + ':', release_time)
            elif info_flag == '点击':
                # The click count is written by JS from a separate PHP endpoint,
                # so it has to be requested on its own.
                news[info_flag] = getClickCount(href)
            else:
                _, sep, value = piece.partition(':')
                if not sep:
                    # No full-width colon: not a "label:value" field, skip it.
                    continue
                news[info_flag] = value
                print(info_flag + ':' + value)
    print('————' * 40)
    return news


def getClickCount(news_url):
    """Return the click count of an article as an ``int``.

    The article id is taken from the detail-page URL and passed to the
    ``api.php?op=count`` endpoint, whose response is a JavaScript snippet of
    the form ``$('#hits').html('<count>')``.

    Args:
        news_url: URL of the article detail page.

    Returns:
        The click count as an integer, so it can be compared numerically.

    Raises:
        ValueError: if the article id cannot be extracted from *news_url* or
            the endpoint response does not contain a hit count.
    """
    id_match = _ARTICLE_ID_RE.search(news_url)
    if id_match is None:
        raise ValueError('cannot extract article id from ' + news_url)
    click_num_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
    click_num_url = click_num_url.format(id_match.group(2))

    res2 = requests.get(click_num_url, timeout=REQUEST_TIMEOUT)
    res2.encoding = 'UTF-8'
    hits_match = _HITS_RE.search(res2.text)
    if hits_match is None:
        raise ValueError('no click count in response from ' + click_num_url)
    click_num = hits_match.group(1)
    print('点击:' + click_num)
    return int(click_num)


# --- Script body: crawl every listing page, then export and query the data ---

print(crawlOnePageSchoolNews('http://news.gzcc.cn/html/xiaoyuanxinwen/'))

pageURL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/', timeout=REQUEST_TIMEOUT)
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
total_match = _NEWS_TOTAL_RE.search(soup.select('a.a1')[0].text)
if total_match is None:
    raise ValueError('cannot find the total news count on the listing page')
newsSum = int(total_match.group(1))
# Page count is the total divided by 10, rounded up.
pageSum = (newsSum + 9) // 10

# Page 1 was crawled above; the numbered pages start at 2.
for i in range(2, pageSum + 1):
    crawlOnePageSchoolNews(pageURL.format(i))

dit = pandas.DataFrame(news_list)
dit.to_excel('test.xlsx')
dit.to_csv('test.csv')

print(dit[['作者', '来源']][:6])
print(dit[(dit['来源'] == '学校综合办') & (dit['点击'] > 3000)])
print(dit[dit['来源'].isin(['国际学院', '学生工作处'])])