import requests
import re
from bs4 import BeautifulSoup


def uslHtml(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup document."""
    res = requests.get(url)
    res.encoding = "utf-8"
    return BeautifulSoup(res.text, "html.parser")


def page(url):
    """Scrape every article linked from a listing page.

    For each <li> in the .news-list block, follows the article link,
    extracts the #content text, writes it to the module-level file *f*,
    and echoes it to stdout.
    """
    soup = uslHtml(url)
    newsList = soup.select(".news-list")[0].select("li")
    for aList in newsList:
        a = aList.select("a")[0].attrs["href"]
        # BUG FIX: the original pattern "_(d+)/(d+)" matched the literal
        # character 'd', so re.search() returned None and .group(0) raised
        # AttributeError on every article URL. \d matches the digits in
        # URLs like ".../xiaoyuanxinwen_0404/9183.html".
        match = re.search(r"_(\d+)/(\d+)", a)
        number = match.group(0) if match else ""
        ress = requests.get(a)
        ress.encoding = "utf-8"
        soup1 = BeautifulSoup(ress.text, "html.parser")
        content = soup1.select("#content")[0].text
        f.write(content)
        print(content)


so = uslHtml("http://news.gzcc.cn/html/xiaoyuanxinwen/")
# Total item count, e.g. "980条" -> 980; the site shows 10 items per page.
n = int(so.select("#pages")[0].select(".a1")[0].text.strip("条"))
n = int(n / 10) + 1
f = open("SchoolNews.txt", "a+", encoding="utf-8")
page("http://news.gzcc.cn/html/xiaoyuanxinwen/")
for i in range(2, 3):
    url1 = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    page(url1)
f.close()
# 2.
def Url(newsUrl):
    """GET *newsUrl* and return the parsed BeautifulSoup document."""
    # Renamed local from 're' — it shadowed the regex module name.
    resp = requests.get(newsUrl)
    resp.encoding = "utf-8"
    return BeautifulSoup(resp.text, "html.parser")


# Accumulates one {"title": ..., "description": ...} dict per scraped item.
newsArr = []


def pageNumber():
    """Return the number of listing pages (total items / 10, plus one)."""
    soup = Url("http://news.gzcc.cn/html/xiaoyuanxinwen/")
    newsPage = int(soup.select("#pages")[0].select(".a1")[0].text.rstrip("条"))
    return int((newsPage / 10) + 1)


def site():
    """Fetch listing pages and collect their entries via News()."""
    newsPage = pageNumber()
    # NOTE(review): this deliberately preserves the original behavior of
    # visiting exactly the two pages (newsPage, newsPage + 1). If every
    # page was intended, it should be range(1, newsPage + 1) — confirm.
    for i in (newsPage, newsPage + 1):
        othersUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
        soup = Url(othersUrl)
        News(soup)


def News(soup):
    """Extract title/description pairs from a listing page into newsArr."""
    newsList = soup.select(".news-list-text")
    # Iterate elements directly instead of range(len(...)).
    for news in newsList:
        # Renamed local from 'dict' — it shadowed the builtin.
        item = {}
        item["title"] = news.select(".news-list-title")[0].text
        item["description"] = news.select(".news-list-description")[0].text
        # append() is the idiomatic form of insert(len(newsArr), ...).
        newsArr.append(item)
    print(newsArr)


site()
# 3.
# Export the scraped news entries to an Excel workbook.
# BUG FIX: 'pandas' was referenced but never imported anywhere in this file,
# so this line raised NameError; import it here at script level.
import pandas

df = pandas.DataFrame(newsArr)
df.to_excel("title.xlsx")
# Preview and filter the scraped DataFrame.
# NOTE: assumes df has 'click', 'title' and 'sources' columns — confirm
# against the code that built it.
preview_cols = ['click', 'title', 'sources']
print(df[preview_cols].head(6))

# Articles from 学校综合办 with more than 3000 clicks.
high_clicks = df['click'] > 3000
from_office = df['sources'] == '学校综合办'
print(df[high_clicks & from_office])

# Articles whose source is one of these two departments.
sou = ['国际学院', '学生工作处']
print(df[df['sources'].isin(sou)])