• <Crawler in Practice> Qiushibaike (糗事百科)


    1. 糗事百科段子.py

    # Goal: scrape the text jokes (段子) from Qiushibaike
    # Fields: author avatar, author name, author level, joke content, "funny" count, comment count
    # Parse the page with each of the methods covered so far: 1) regular expressions, 2) BeautifulSoup, 3) XPath
    import requests
    import re  # regular expressions
    import json
    from bs4 import BeautifulSoup  # BeautifulSoup
    from lxml import etree  # XPath
    
    
    def get_one_page(url):
    	# Fetch one page; return the HTML text, or None for a non-200 response
    	response = requests.get(url)
    	if response.status_code == 200:
    		return response.text
    	return None
    
    
    def zhengze_parse(html):
    	# Method 1: regular expression. Capture groups, in order:
    	# avatar URL, author name, author level, content, funny count, comment count
    	pattern = re.compile(
    		'<img src="//(.*?)".*?alt="(.*?)".*?<a.*?<div class=".*?">(.*?)</div>'
    		+ '.*?<div class=.*?<span>(.*?)</span>.*?<span class=".*?".*?<i.*?>(.*?)<'
    		+ '.*?<i.*?>(.*?)<',
    		re.S)
    	items = re.findall(pattern, html)
    	for item in items:
    		content = item[3].replace('<br/>', '').strip()
    		content = content.replace('\x01', '')  # strip the stray control character found in some contents
    		# the regex also matches the page footer (the public-security filing number); stop there
    		if item[5] == '京公网安备11010502031601号':
    			break
    		yield {
    			'image': "http://" + item[0],
    			'name': item[1],
    			'grade': item[2],
    			'content': content,
    			'fun_Num': item[4],
    			'com_Num': item[5]
    		}
    
    
    def soup_parse(html):
    	# Method 2: BeautifulSoup
    	soup = BeautifulSoup(html, 'lxml')
    	for data in soup.find_all('div', class_='article'):
    		image = "http:" + data.img['src']
    		name = data.img['alt']
    		# anonymous users have no level badge
    		if name == "匿名用户":
    			grade = "匿名用户"
    		else:
    			grade = data.find('div', class_='articleGender').text
    		content = data.find('div', class_='content').span.text.strip()
    		fun_Num = data.find('i', class_='number').text
    		com_Num = data.find('a', class_='qiushi_comments').i.text
    
    		yield {
    			'image': image,
    			'name': name,
    			'grade': grade,
    			'content': content,
    			'fun_Num': fun_Num,
    			'com_Num': com_Num,
    		}
    
    
    def xpath_parse(html):
    	# Method 3: XPath
    	html = etree.HTML(html)
    	for data in html.xpath('//div[@class="col1"]/div'):
    		image = "http:"+ str(data.xpath('.//img/@src')[0])
    		name = data.xpath('.//img/@alt')[0]
    		if name == '匿名用户':
    			grade = '匿名用户'
    		else:
    			grade = data.xpath('./div[1]/div/text()')[0]
    		content = data.xpath('./a/div/span/text()')[0:]
    		content = str(content).strip().replace('\n','')
    		fun_Num = data.xpath('./div[2]/span[1]/i/text()')[0]
    		com_Num = data.xpath('.//div[2]/span[2]/a/i/text()')[0]
    		# print(image, name, grade, content, fun_Num, com_Num)
    		yield {
    			'image': image,
    			'name': name,
    			'grade': grade,
    			'content': content,
    			'fun_Num': fun_Num,
    			'com_Num': com_Num,
    		}
    
    
    def write_to_file(content, flag):
    	# Append one record per line as JSON, to a file named after the parsing method
    	with open('糗百段子(' + str(flag) + ').txt', 'a', encoding='utf-8') as f:
    		f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    
    def search(Num):
    	url = 'https://www.qiushibaike.com/text/page/' + str(Num) + '/'
    	html = get_one_page(url)
    	# The regex cannot match the level of anonymous users, so it skips their jokes
    	# entirely and yields fewer records than the other two parsers; a small extra
    	# branch would fix this (see the comparison sketch after this listing)
    	for item in zhengze_parse(html):
    		write_to_file(item, '正则表达式')
    
    	for item in soup_parse(html):
    		write_to_file(item, 'BS4')
    
    	for item in xpath_parse(html):
    		write_to_file(item, 'xpath')
    	page = str(Num)
    	print("正在爬取第" + page + '页')
    
    
    def main():
    	# page numbers to crawl (pages 1 to 13)
    	for Num in range(1, 14):
    		search(Num)
    	print("爬取完成")
    
    
    if __name__ == '__main__':
    	# entry point
    	main()
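
    As the comment in search() notes, the regex pattern cannot match the level of anonymous users and skips their jokes entirely, so it yields fewer records than the BeautifulSoup and XPath parsers. A quick sanity check is to run all three parsers over the same HTML and compare the counts. The helper below is a minimal sketch, not part of the original script; the name compare_parsers is made up for illustration:

    def compare_parsers(url):
    	# Fetch the page once and run every parser over the same HTML,
    	# then print how many items each of them yields.
    	html = get_one_page(url)
    	if html is None:
    		print("request failed:", url)
    		return
    	counts = {
    		'regex': sum(1 for _ in zhengze_parse(html)),
    		'BS4': sum(1 for _ in soup_parse(html)),
    		'xpath': sum(1 for _ in xpath_parse(html)),
    	}
    	print(counts)  # the regex count drops whenever anonymous posts appear on the page

    Calling it with one of the page URLs built in search() prints the three counts side by side, which makes the missing anonymous-user records easy to spot.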
    

      

    2. Packaging

    pyinstaller -F 糗事百科段子.py
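
    The -F (--onefile) flag packs the script and its dependencies into a single executable, which is slower to build and to start. While iterating, PyInstaller's default one-folder build (-D / --onedir) is a possible alternative, assuming the same script name:

    pyinstaller -D 糗事百科段子.py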
    

      

    3. Run results

    How jokes posted by anonymous users (匿名用户) are displayed on the page (screenshot not included in this text version).
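
    Each run appends one JSON record per line to 糗百段子(正则表达式).txt, 糗百段子(BS4).txt and 糗百段子(xpath).txt. The line below only illustrates the key layout produced by the parsers; every value is a placeholder, not real scraped data:

    {"image": "http://<avatar-url>", "name": "<author name>", "grade": "<author level>", "content": "<joke text>", "fun_Num": "<funny count>", "com_Num": "<comment count>"}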
