import re
import requests
import pandas
import sqlite3
from bs4 import BeautifulSoup


# Get the click count for a single news article via the site's counter API,
# wrapped as a function.
def getclick(newurl):
    # A news URL looks like .../xiaoyuanxinwen_0404/9183.html; the trailing
    # number is the article id expected by the counter API.
    num = re.search('_(.*).html', newurl).group(1).split('/')[1]
    url = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(num)
    # The API responds with a JS snippet like "...html('123');" -- strip the
    # wrapper to get the bare count.
    click = int(requests.get(url).text.split('.')[-1].lstrip("html('").rstrip("');"))
    return click


# Extract title, link, time, source, body and click count for every news item
# among the <a> tags of one listing page, wrapped as a function.
def getcontent(s):
    newslist = []
    for i in s:
        if len(i.select(".news-list-title")) > 0:
            new = {}
            new["title"] = i.select(".news-list-title")[0].text
            new["time"] = i.select(".news-list-info")[0].contents[0].text
            new["content"] = i.select(".news-list-info")[0].contents[1].text
            new["url"] = i.attrs['href']
            url = i.attrs['href']
            res1 = requests.get(url)
            res1.encoding = "utf-8"
            soup1 = BeautifulSoup(res1.text, 'html.parser')
            new["click"] = getclick(url)
            # The article body lives in a div with class "show-content".
            for x in soup1.select("div"):
                if len(x.select(".show-content")) > 0:
                    new["show"] = x.select(".show-content")[0].text
                    break
            newslist.append(new)
    return newslist


# Crawl one listing page given its URL and return its news items.
def onepage(urlpage):
    res = requests.get(urlpage)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, 'html.parser')
    return getcontent(soup.select("a"))


# Crawl the first listing page.
res = requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/")
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, 'html.parser')
listtotal = []
listtotal.extend(getcontent(soup.select("a")))

# Work out how many listing pages there are (the ".a1" element holds the total
# item count, e.g. "237条", at 10 items per page), then crawl the remaining
# pages with the function above.
pagecount = int(soup.select(".a1")[0].text.rstrip('条')) // 10 + 1
for i in range(2, pagecount + 1):
    urlpage = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    listtotal.extend(onepage(urlpage))

# Save the results to Excel and to a SQLite database.
df = pandas.DataFrame(listtotal)
df.to_excel('gzccnews.xlsx')
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df.to_sql('gzccnewsdb', con=db, if_exists='replace')
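
# Quick sanity check (a minimal sketch, assuming the script above has already
# run and created gzccnewsdb.sqlite with a table named gzccnewsdb): read the
# table back with pandas and inspect a few rows to confirm the save worked.
import pandas
import sqlite3

with sqlite3.connect('gzccnewsdb.sqlite') as db:
    saved = pandas.read_sql_query('SELECT title, time, click FROM gzccnewsdb', con=db)
print(saved.head())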