0.从新闻url获取点击次数,并整理成函数
- newsUrl
- newsId(re.search())
- clickUrl(str.format())
- requests.get(clickUrl)
- re.search()/.split()
- str.lstrip(),str.rstrip()
- int
- 整理成函数
- 获取新闻发布时间及类型转换也整理成函数
代码展示:
# Scrape campus news from news.gzcc.cn: for each article collect title,
# publish time, body text and click count, then dump everything to news.csv.
import re
import time
from datetime import datetime

import pandas as pd


# Fetch one article page and return its fields as a dict.
def annews(url):
    """Download the article at *url* and extract title/date/content/count."""
    # Local imports: third-party packages, kept function-scoped so the
    # pure helpers below stay importable without them.
    import bs4
    import requests
    news = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    news['title'] = soup.select(".show-title")[0].text
    news['date'] = getTime(soup.select(".show-info")[0])
    news['content'] = soup.select("#content")[0].text.strip()
    news['count'] = getCount(url)
    return news


# Get the click count for an article.
def getCount(url):
    """Pull the numeric id out of *url* and query the click-count API."""
    import requests
    # BUGFIX: original pattern '/(d+).html' was missing the backslash,
    # so it matched a literal 'd' rather than digits and search()
    # returned None for every real article url.
    newsId = re.search(r'/(\d+)\.html', url).group(1)
    countHtml = requests.get(
        "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newsId))
    # The count is the first run of digits after the last ".html"
    # fragment of the jsonp-style response.  BUGFIX: "d+" -> r"\d+".
    count = int(re.findall(r"\d+", countHtml.text.split(".html")[-1])[0])
    return count


# Parse the publish time out of the ".show-info" element.
def getTime(info):
    """Return the publish datetime parsed from *info*.text.

    *info* is expected to expose a ``.text`` attribute whose first two
    whitespace-separated tokens are '发布时间:YYYY-MM-DD' and 'HH:MM:SS'.
    (Parameter renamed from ``str``, which shadowed the builtin.)
    """
    infos = info.text.split()
    datestr = infos[0] + ' ' + infos[1]
    return datetime.strptime(datestr, '发布时间:%Y-%m-%d %H:%M:%S')


# Build the listing-page urls for page numbers start .. end-1.
def getNsUrl(url, start, end):
    """Return ["{url}/{i}.html" for i in range(start, end)]."""
    urls = []
    for i in range(start, end):
        urls.append("{}/{}.html".format(url, i))
    return urls


# Collect every article url linked from one listing page.
def getNewsUrl(url):
    """Return the href of each anchor under ".news-list" on *url*."""
    import bs4
    import requests
    newsUrlList = []
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    alist = soup.select(".news-list a")
    for i in alist:
        newsUrlList.append(i['href'])
    return newsUrlList


if __name__ == "__main__":
    # Crawl listing pages 66..75, sleeping 1s between articles to be
    # polite to the server, then write all rows to news.csv.
    urls = getNsUrl('http://news.gzcc.cn/html/xiaoyuanxinwen', 66, 76)
    newsList = []
    for page in urls:
        for url in getNewsUrl(page):
            time.sleep(1)
            newsList.append(annews(url))
    df = pd.DataFrame(newsList)
    df.to_csv("news.csv", encoding='utf-8')
运行截图: