• How do you scrape the information you need from a web page with BeautifulSoup?
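In short: fetch the page HTML, hand it to BeautifulSoup, and pick out the elements you need by tag or CSS class. A minimal sketch of that loop (the URL and the "content" class here are placeholders, not the actual site):

    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    # fetch and parse one page (example.com is a placeholder URL)
    html = urlopen('https://example.com').read().decode('utf-8')
    page = BeautifulSoup(html, features='lxml')

    # pull the heading text and every <div> of a given class
    heading = page.find('h1')
    print(heading.get_text() if heading else 'no <h1> found')
    for block in page.find_all('div', {'class': 'content'}):  # placeholder class name
        print(block.get_text())

The full script below combines this idea with Selenium: a headless Firefox first collects the per-book links, then BeautifulSoup parses each linked page and saves the result as JSON.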


    #!/usr/bin/python
#-*-coding:utf-8-*-       # declare the source encoding (Python 3 strings are Unicode by default)
    
    import json,os,sys
    from bs4 import BeautifulSoup
    from urllib.request import urlopen
    import re
    import time
    
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    
    
    def first():
	binary = r'C:\Program Files\Mozilla Firefox32\firefox.exe'  # raw string keeps the Windows backslashes intact
    	options = Options()
	options.headless = True  # set_headless() is deprecated in newer Selenium releases
    	options.binary = binary
    	cap = DesiredCapabilities().FIREFOX
    	cap["marionette"] = True #optional
    
    	fp = webdriver.FirefoxProfile()
    	fp.set_preference("permissions.default.stylesheet" ,2)
    	fp.set_preference("permissions.default.image" ,2)
    
	# pass the profile so the stylesheet/image blocking above actually applies,
	# and use a raw string so the backslashes in the path are not treated as escapes
	driver = webdriver.Firefox(firefox_profile=fp, options=options, capabilities=cap, executable_path=r"C:\Program Files\geckodriver-v0.26.0-win64\geckodriver.exe")
    	return driver
    
    def readlinkfile(sourcename='tangshisanbaishou.txt'):
    	with open(sourcename, 'rt', encoding='utf-8') as f:
    		data = f.read()
    
    	# one link per line; drop empty lines so downstream code never sees ''
    	all_links = [line for line in data.split('\n') if line.strip()]
    	return all_links
    
    def writeTxtFile(data,outfilename):
    	with open(outfilename+'.txt', 'wt', encoding='utf-8') as f:
    		for m in data:
    			f.write(m + '\n')
    
    def writeJsonFile(data,outfilename):
        with open(outfilename+'.json', 'wt', encoding='utf-8') as f:
            for m in data:
                json.dump(m, f, ensure_ascii=False, indent=4)
    
    def writeJsonFileAddEndFile(data,outfilename):
        # append mode: repeated dumps produce back-to-back JSON objects in one file,
        # not a single JSON array, so read the file back accordingly
        with open(outfilename+'.json', 'a', encoding='utf-8') as f:
            for m in data:
                json.dump(m, f, ensure_ascii=False, indent=4)
    
    
    
    def action(driver,link):
    	url = "https://****.org" +link
    	driver.get(url)
    
    	booklinks = []
    	elements = driver.find_elements_by_css_selector(".bookcont a")
    
    	title = driver.find_element_by_css_selector("h1").text
    
    	# print(elements.text)
    	for e in elements:
    		# print(e.get_property('href'))
    		booklinks.append(e.get_property('href'))
    
    	writeTxtFile(booklinks,'./onebooklink/'+ title)
    
    
    def createLinkList():
    	driver = first()
    	links = readlinkfile('gujilinks.txt')
    
    	for link in links:
    		action(driver,link)
    		# break
    		
    	driver.quit()
    
    # createLinkList()
    
    
    def soup(gushiurl):
    	# assert gushiurl
    	if not gushiurl:
    		return
    	# gushiurl = str("https://****.org" + gushiurl)
    	print(gushiurl)
    
    	html = urlopen(gushiurl).read().decode('utf-8')
    	# print(html)
    
    	soup = BeautifulSoup(html, features='lxml')
    
    	contsons = soup.find_all('div', {"class": "contson"})
    	title = soup.find_all('h1')
    
    	# strip the "译注" (annotation) label that the site embeds in the title
    	h1 = title[0].get_text().replace('\n译注\n\n', '')
    	text = []
    
    	for item in contsons:
    		text.append(item.get_text())
    
    	temp= {
    		'title':h1,
    		'text':text
    	}
    
    	contents = []
    	soup = None
    	contents.append(temp)
    	return contents
    
    
    path = r"G:\workspace\python\selenium\guji\restlinks"  # folder holding the remaining link files
    Files_Global = []
    
    def file_name_walk(file_dir):
        for files in os.listdir(file_dir):
            Files_Global.append(files)  # collect every entry name under the directory
    
    
    def getOne(name):
    	links = readlinkfile('./onebooklink/'+name+'.txt')
    	for link in links:
    		contents = soup(link)
    		writeJsonFileAddEndFile(contents,'./gujisourse/'+name)
    
    def getOne2(name):
    	print(name)
    	links = readlinkfile('./restlinks/'+name)
    	index = 0
    	for link in links:
    		time.sleep(0.2)
    		print(index)
    		contents = soup(link)
    		name = name.replace('.txt','')
    		if contents:
    			writeJsonFileAddEndFile(contents,'./gujisourse/'+name)
    			index += 1
    
    def run():
    
    	file_name_walk(path)
    
    	for name in Files_Global:
    		print(name)
    		
    		try:
    			getOne2(name)
    		except Exception:
    			raise  # bare raise preserves the original traceback
    		
    	
    		# break
    
    # run()
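Roughly, the workflow is two passes: uncomment createLinkList() once to harvest the per-book link files, then uncomment run() to download every linked page into ./gujisourse/. For example:

    if __name__ == '__main__':
    	createLinkList()  # pass 1: Selenium gathers the links for each book
    	run()             # pass 2: BeautifulSoup scrapes each link into JSON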
    

  Prerequisite skills:

    • basic Python syntax
    • the BeautifulSoup documentation
    • CSS selector syntax (see the sketch below)
    • simple ways to read and write files
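  BeautifulSoup also understands CSS selectors directly through select(), which mirrors the find_elements_by_css_selector calls in the Selenium code above. A small sketch (the HTML is made up for illustration; only the ".bookcont a" pattern comes from the script):

    from bs4 import BeautifulSoup

    html = '<div class="bookcont"><a href="/p1">one</a><a href="/p2">two</a></div>'
    page = BeautifulSoup(html, features='lxml')

    # '.bookcont a' matches every <a> inside an element with class "bookcont"
    for a in page.select('.bookcont a'):
        print(a['href'], a.get_text())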
  • Original source: https://www.cnblogs.com/xixiaohui/p/12145032.html