• urllib requests, the requests library, the BeautifulSoup parser, and the lxml parser


    Fetching a URL with urllib

    from urllib.request import urlopen

    url = "****"
    response = urlopen(url)            # open the URL and get a response object
    content = response.read()         # read the raw body bytes
    content = content.decode('utf-8') # decode the bytes into a string
    print(content)
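
    Bare urlopen(url) cannot attach headers and raises on HTTP errors. Below is a minimal sketch of a more defensive fetch; the example.com URL, User-Agent string, and 10-second timeout are illustrative assumptions, not from the original.

    from urllib.request import Request, urlopen
    from urllib.error import HTTPError, URLError

    url = "https://example.com/"  # hypothetical URL; the original elides it
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})  # Request allows custom headers
    try:
        with urlopen(req, timeout=10) as response:
            print(response.read().decode('utf-8'))
    except HTTPError as e:
        print('server returned an error:', e.code)    # 4xx/5xx responses raise HTTPError
    except URLError as e:
        print('failed to reach the server:', e.reason)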

    Fetching a URL with requests

    import requests

    url = "***"
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/'
    }
    res = requests.get(url, headers=headers)  # the headers disguise the request as an ordinary browser visit
    print(res.status_code)  # HTTP status code
    print(res.text)         # response body decoded as text
    print(res.content)      # raw bytes, suitable for images or video as well
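
    requests can also enforce timeouts, surface HTTP errors, and re-guess the response encoding, which matters for Chinese pages served with a vague charset. A minimal sketch under those assumptions (the URL and timeout are placeholders, not from the original):

    import requests

    url = "https://example.com/"  # hypothetical URL
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()                # raise requests.HTTPError on 4xx/5xx
        res.encoding = res.apparent_encoding  # infer the encoding from the body itself
        print(res.text[:200])                 # first 200 characters of the decoded body
    except requests.RequestException as e:    # base class for all requests errors
        print('request failed:', e)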

    Parsing with BeautifulSoup

    import requests
    from bs4 import BeautifulSoup

    url = "http://news.qq.com/"
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/'
    }
    res = requests.get(url, headers=headers)  # the headers disguise the request as an ordinary browser visit
    soup = BeautifulSoup(res.text, 'lxml')    # parse the HTML with the lxml backend
    # on this page the headline links sit inside <em class="f14 l24"> elements
    items = soup.find_all('em', attrs={'class': 'f14 l24'})
    for i in items:
        title = i.a.get_text()  # text of the <a> inside the <em>
        link = i.a['href']      # its href attribute
        print({
            'title': title,
            'link': link
        })
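
    find_all is one of several lookup styles: find returns only the first match, class_ avoids the attrs dict, and select takes CSS selectors. A short sketch against an invented HTML fragment, since the live page's markup changes over time:

    from bs4 import BeautifulSoup

    # invented fragment, only to demonstrate the API
    html = '''
    <div class="news">
      <em class="f14 l24"><a href="/a1">First headline</a></em>
      <em class="f14 l24"><a href="/a2">Second headline</a></em>
    </div>
    '''
    soup = BeautifulSoup(html, 'lxml')

    first = soup.find('em', class_='f14 l24')  # first match only
    print(first.a.get_text())                  # -> First headline

    for a in soup.select('em.f14.l24 a'):      # CSS-selector equivalent of the find_all above
        print(a.get_text(), a['href'])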

    Parsing with lxml and XPath expressions

    import requests
    from lxml import etree

    url = "http://news.qq.com/"
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/'
    }
    html = requests.get(url, headers=headers)  # the headers disguise the request as an ordinary browser visit
    con = etree.HTML(html.text)                # build an lxml element tree from the HTML
    # XPath: text and href of the <a> inside each <em class="f14 l24">
    title = con.xpath('//em[@class="f14 l24"]/a/text()')
    link = con.xpath('//em[@class="f14 l24"]/a/@href')
    for t, l in zip(title, link):
        print({
            'title': t,
            'link': l
        })
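
    A few other XPath patterns come up constantly: contains() for partial class matches, positional indexing, and attribute extraction. Sketched on an invented fragment, since the live page's markup may differ:

    from lxml import etree

    # invented fragment for illustration
    html = '''
    <ul>
      <li class="item hot"><a href="/x">X</a></li>
      <li class="item"><a href="/y">Y</a></li>
    </ul>
    '''
    tree = etree.HTML(html)

    # contains() matches when the class attribute includes the substring
    print(tree.xpath('//li[contains(@class, "hot")]/a/text()'))  # ['X']
    # parentheses group the match set; XPath indexing is 1-based
    print(tree.xpath('(//li/a)[1]/@href'))                       # ['/x']
    print(tree.xpath('//li/a/text()'))                           # ['X', 'Y']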

    The select method

    import requests
    from bs4 import BeautifulSoup

    url = "http://news.qq.com/"
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/'
    }
    res = requests.get(url, headers=headers)  # the headers disguise the request as an ordinary browser visit
    soup = BeautifulSoup(res.text, 'lxml')
    # the CSS selector yields the <a> tags themselves, so no extra .a step is needed
    links = soup.select('em.f14.l24 a')
    for a in links:
        print({
            'title': a.get_text(),
            'link': a['href']
        })
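
    select accepts most CSS syntax, and select_one returns just the first match. A brief sketch on an invented fragment:

    from bs4 import BeautifulSoup

    html = '<p><a href="/a" target="_blank">A</a><a href="/b">B</a></p>'  # invented fragment
    soup = BeautifulSoup(html, 'lxml')

    print(soup.select_one('a')['href'])  # first match only -> /a
    # attribute selector: only links that open in a new tab
    print([a.get_text() for a in soup.select('a[target="_blank"]')])  # ['A']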
• Original post: https://www.cnblogs.com/ZHANG576433951/p/11152616.html