"""Crawl game listings from wandoujia.com and store them in MongoDB.

Index (listing) page fields:
    icon URL, download count, size, detail-page URL

Detail page fields:
    game name, approval rating, comment count, editor's review,
    download URL, description, user comments, 1-5 screenshot URLs
    (this script currently stores only the first five of these).

Listing API examples (32 pages in total):
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
"""
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re

# Connect to the local MongoDB server.
client = MongoClient('localhost', 27017)
# Collection holding the listing-page records.
index_col = client['wandoujia']['index']
# Collection holding the detail-page records.
detail_col = client['wandoujia']['detail']

# Listing API; ``{page}`` is substituted with the 1-based page number.
LIST_URL_TEMPLATE = (
    "https://www.wandoujia.com/wdjweb/api/category/more"
    "?catId=6001&subCatId=0&page={page}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
)
# Pages 1..32 are crawled (32 is the page count noted in the module docstring).
FIRST_PAGE = 1
LAST_PAGE = 32

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Matches text containing "<digits>MB"; compiled once instead of per card.
_SIZE_PATTERN = re.compile(r"\d+MB")


# 1. Send the request
def get_page(url, timeout=REQUEST_TIMEOUT):
    """Send a GET request to *url* and return the ``requests`` response.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait before ``requests`` raises a timeout
            error. Without a timeout a stalled connection would block the
            crawl forever.

    Returns:
        The ``requests.Response`` object, unmodified (status is not checked).
    """
    return requests.get(url, timeout=timeout)


def _find_text(soup, tag, css_class):
    """Return the text of the first *tag* with class *css_class*, or None."""
    node = soup.find(name=tag, attrs={"class": css_class})
    return node.text if node is not None else None


# 2. Parse
def parse_detail(text):
    """Parse an app detail page and insert one record into ``detail_col``.

    Each field is looked up independently; a field whose element is absent
    is stored as ``None``, except the rating and the download link, which
    get the placeholder strings used below.

    Args:
        text: HTML source of the detail page.
    """
    soup = BeautifulSoup(text, 'lxml')

    # App name
    name = _find_text(soup, 'span', 'title')
    # Approval rating
    love = _find_text(soup, 'span', 'love')
    # Number of comments
    commit_num = _find_text(soup, 'a', 'comment-open')
    # Editor's review
    commit_content = _find_text(soup, 'div', 'con')

    # Download link: the anchor may be missing, or may lack an href.
    download_node = soup.find(name='a', attrs={"class": "normal-dl-btn"})
    download_url = (
        download_node.attrs.get('href') if download_node is not None else None
    )

    # Pass the parsed values (not string literals) so the banner shows data.
    print('''
    ============= tank ==============
    app名称:{name}
    好评率: {love}
    评论数: {commit_num}
    小编点评: {commit_content}
    app下载链接: {download_url}
    ============= end ==============
    '''.format(
        name=name,
        love=love,
        commit_num=commit_num,
        commit_content=commit_content,
        download_url=download_url,
    ))

    # Always build the record so the insert below can never hit an unbound
    # name; empty rating / download link fall back to their placeholders.
    detail_data = {
        'name': name,
        'love': love or "没人点赞,很惨",
        'commit_num': commit_num,
        'commit_content': commit_content,
        'download_url': download_url or "没有安装包",
    }

    # Insert the detail record (``insert`` was removed in PyMongo 4).
    detail_col.insert_one(detail_data)
    print('{name}app数据插入成功!'.format(name=name))


def parse_index(data):
    """Parse listing HTML, store each app card, then crawl its detail page.

    For every ``<li class="card">`` element the icon URL, download count,
    size and detail-page URL are inserted into ``index_col``; the detail
    page is then fetched with ``get_page`` and handed to ``parse_detail``.

    Args:
        data: HTML fragment containing the app cards.

    Raises:
        AttributeError, KeyError: If a card lacks the ``img`` tag (or its
            ``data-original`` attribute), the ``install-count`` span, or
            the first ``a`` tag / its ``href``.
    """
    soup = BeautifulSoup(data, 'lxml')

    # Every app is rendered as an <li class="card"> element.
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # Icon URL: data-original attribute of the card's first <img>.
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Download count: text of the span with class "install-count".
        down_num = app.find(
            name='span', attrs={"class": "install-count"}
        ).text
        print(down_num)

        # Size: the span inside THIS card whose text contains "<digits>MB".
        # Searching the card (not the whole page) keeps sizes per-app; the
        # size is None when no span in the card matches the pattern.
        size_node = app.find(name='span', string=_SIZE_PATTERN)
        size = size_node.text if size_node is not None else None
        print(size)

        # Detail-page URL: href of the card's first <a>.
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # Assemble the listing record.
        index_data = {
            'img': img,
            'down_num': down_num,
            'size': size,
            'detail_url': detail_url,
        }

        # Insert the listing record (``insert`` was removed in PyMongo 4).
        index_col.insert_one(index_data)
        print('主页数据插入成功!')

        # 3. Request the app's detail page
        response = get_page(detail_url)

        # 4. Parse the app's detail page
        parse_detail(response.text)


def main():
    """Crawl every listing page, then close the MongoDB client."""
    try:
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            # Substitute the page number; the original sent the literal
            # "{line}" placeholder on every iteration.
            url = LIST_URL_TEMPLATE.format(page=page)

            # 1. Request the listing API
            response = get_page(url)
            print('*' * 1000)
            # Deserialize the JSON body into a dict.
            data = response.json()

            # The HTML for the app cards lives under data -> content.
            app_li = data['data']['content']
            # 2. Parse the app cards
            parse_index(app_li)
    finally:
        # Close the MongoDB client even when the crawl fails part-way.
        client.close()


if __name__ == '__main__':
    main()