• Python scraping practice: crawling Wandoujia's "casual puzzle" game apps


'''
Index page:
    icon URL, download count, size, detail-page URL

Detail page:
    app name, rating, comment count, editor's review, download URL,
    description, user comments, links to 1-5 screenshots

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

...and so on; the category spans 32 pages in total.
'''
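# (A sketch, not part of the original script.) A quick probe of the API
# above before crawling: the JSON payload carries the card markup under
# data['data']['content'], which is what parse_index() below consumes.
# Assumes the ctoken above is still accepted by the server.
#
#   import requests
#   api = ("https://www.wandoujia.com/wdjweb/api/category/more"
#          "?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B")
#   payload = requests.get(api).json()
#   print(list(payload['data'].keys()))  # 'content' should be among the keys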
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re

# Connect to MongoDB
client = MongoClient('localhost', 27017)
# Collection for index-page records
index_col = client['wandoujia']['index']
# Collection for detail-page records
detail_col = client['wandoujia']['detail']

# 1. Send a request
def get_page(url):
    response = requests.get(url)
    return response

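# Note (an assumption, not in the original): Wandoujia may reject bare
# requests that lack a browser-like User-Agent. If responses come back
# empty, a header can be supplied, e.g.:
#   response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})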
# 2. Parsing
# Parse a detail page
def parse_detail(text):
    soup = BeautifulSoup(text, 'lxml')

    # App name
    try:
        name = soup.find(name="span", attrs={"class": "title"}).text
    except Exception:
        name = None

    # Rating (share of positive reviews)
    try:
        love = soup.find(name='span', attrs={"class": "love"}).text
    except Exception:
        love = None

    # Comment count
    try:
        commit_num = soup.find(name='a', attrs={"class": "comment-open"}).text
    except Exception:
        commit_num = None

    # Editor's review
    try:
        commit_content = soup.find(name='div', attrs={"class": "con"}).text
    except Exception:
        commit_content = None

    # App download link
    try:
        download_url = soup.find(name='a', attrs={"class": "normal-dl-btn"}).attrs['href']
    except Exception:
        download_url = None

    print('''
        ============= tank ==============
        App name: {name}
        Rating: {love}
        Comment count: {commit_num}
        Editor's review: {commit_content}
        Download link: {download_url}
        ============= end ==============
        '''.format(name=name, love=love, commit_num=commit_num,
                   commit_content=commit_content, download_url=download_url)
          )

    # Assemble the document, falling back to placeholders for missing fields
    detail_data = {
        'name': name,
        # If nobody has rated the app, record a placeholder
        'love': love if love else "No likes yet, sad",
        'commit_num': commit_num,
        'commit_content': commit_content,
        # If there is no installer, record a placeholder
        'download_url': download_url if download_url else "No installer available",
    }

    # Insert the detail-page record
    detail_col.insert_one(detail_data)
    print('{name} app data inserted!'.format(name=name))

# Parse the index page
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')

    # Every app sits in an <li class="card"> tag
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # Icon URL: the data-original attribute of the first <img> tag
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Download count: text of the <span class="install-count"> tag
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)

        # Size: text of the <span> whose text matches "digits + MB" (\d+ matches digits)
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        print(size)

        # Detail-page URL: href of the first <a> tag
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # Assemble the record
        index_data = {
            'img': img,
            'down_num': down_num,
            'size': size,
            'detail_url': detail_url,
        }

        # Insert the index-page record
        index_col.insert_one(index_data)
        print('Index data inserted!')

        # 3. Request the app's detail page
        response = get_page(detail_url)

        # 4. Parse the detail page
        parse_detail(response.text)

def main():
    for page in range(1, 33):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={page}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"

        # 1. Request the category API
        response = get_page(url)
        print('*' * 1000)
        # Deserialize the JSON body into a dict
        data = response.json()

        # The app card markup lives under data['data']['content']
        app_li = data['data']['content']
        # 2. Parse the app card markup
        parse_index(app_li)

    # Close the MongoDB client only after all pages are done
    client.close()

if __name__ == '__main__':
    main()
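To sanity-check a finished run, here is a minimal sketch (assuming a local mongod on port 27017 and the collection names used above) that counts what the crawl stored and peeks at one detail record:

    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    index_col = client['wandoujia']['index']
    detail_col = client['wandoujia']['detail']

    # Count the documents written by the crawl
    print(index_col.count_documents({}))
    print(detail_col.count_documents({}))

    # Peek at one detail record (drop the internal _id for readability)
    print(detail_col.find_one({}, {'_id': 0}))

    client.close()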
• Original post: https://www.cnblogs.com/lweiser/p/11066408.html