Scraping the Wandoujia top chart
Parse the data with bs4 and save it to MySQL.
import requests
from bs4 import BeautifulSoup
import pymysql
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
# Collect the links to each app's detail page from the chart page
def get_urls(url):
    page = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(page, 'lxml')
    urls = soup.select('.icon-wrap>a')
    return urls
# Scrape one app's detail page and insert its fields into MySQL
def get_detail(url, conn, cursor):
    page = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(page, 'lxml')
    img = soup.select('.app-icon>img')[0]['src']                      # app icon URL
    name = soup.select('.app-name>span')[0].text                      # app name
    count = soup.select('.app-info-data>span>i')[0].text              # install count
    ping_size = soup.select('.app-info-data>.love>i')[0].text         # rating
    ping_num = soup.select('.app-info-data>.comment-open>i')[0].text  # number of comments
    sql = 'insert into spider values (%s,%s,%s,%s,%s)'
    cursor.execute(sql, (img, name, count, ping_size, ping_num))
    conn.commit()
if __name__ == '__main__':
    url_home = 'https://www.wandoujia.com/top/game'
    urls = get_urls(url_home)
    conn = pymysql.connect(user='root', password='root', database='t5')
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    for i in urls:
        get_detail(i['href'], conn, cursor)
    cursor.close()
    conn.close()
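
The insert statement assumes a five-column spider table already exists in the t5 database; nothing in the script creates it. Below is a minimal one-off setup sketch. The column names (icon_url, app_name, and so on) are assumptions for illustration; only the column count and order have to match the five %s placeholders.

import pymysql

# One-off setup sketch: create the table the scraper inserts into.
# Column names are illustrative; only their count and order must
# match the insert statement in the scraper.
conn = pymysql.connect(user='root', password='root', database='t5')
cursor = conn.cursor()
cursor.execute('''
    create table if not exists spider (
        icon_url      varchar(255),  -- img:       app icon URL
        app_name      varchar(100),  -- name:      app name
        install_count varchar(50),   -- count:     install count, kept as scraped text
        rating        varchar(50),   -- ping_size: rating figure
        comment_count varchar(50)    -- ping_num:  number of comments
    )
''')
conn.commit()
cursor.close()
conn.close()

All five columns are stored as text because the scraped values arrive as strings (e.g. "1.2亿" for the install count); converting them to numeric types is left to whoever queries the table.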