• 获取百度搜索结果的真实url以及摘要和时间


    利用 requests 库和 bs4(BeautifulSoup)实现,demo 如下:

    #coding:utf-8
    import requests
    from bs4 import BeautifulSoup
    import bs4
    import re
    # HTTP request headers used when downloading the search page; only a
    # User-Agent is supplied.
    headers = {'User-Agent': 'Chrome/68.0.3440.106'}
    
    def getHTMLText(url, timeout=10):
        """Download ``url`` and return the response body as text.

        The request is sent with the module-level ``headers`` dict. Before
        decoding, the response encoding is set from ``apparent_encoding``
        rather than trusting the encoding declared by the server.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded page text, or ``''`` when the request fails or the
            server answers with an error status.
        """
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Best-effort fetch: connection errors, timeouts and HTTP error
            # statuses all yield an empty page. Anything else (e.g.
            # KeyboardInterrupt or a programming error) is left to propagate.
            return ''
    '''
    def getHref(html):
        soup=BeautifulSoup(html,'lxml')
        for node in soup.find_all()
    '''
    def bdurlCode(url, timeout=10):
        """Resolve a redirect link to the address it points at.

        Issues a GET with automatic redirects disabled and reads the target
        from the ``Location`` response header.

        Args:
            url: The link to resolve (fillList passes the ``href`` of a
                search-result anchor).
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The value of the ``Location`` header, or ``url`` unchanged when
            the response carries no such header (i.e. it was not a redirect).

        Raises:
            requests.RequestException: If the request itself fails.
        """
        res = requests.get(url, allow_redirects=False, timeout=timeout)
        # A non-redirect response has no Location header; fall back to the
        # input instead of raising KeyError.
        return res.headers.get('location', url)
    
    def fillList(ulist, html):
        """Extract search results from ``html`` and append them to ``ulist``.

        For every result container that has a time stamp, an abstract and a
        link, one ``[url, time, abstract]`` entry is appended, where ``url``
        is the link's ``href`` resolved through ``bdurlCode``. Containers
        missing any of those pieces are skipped. After processing, the whole
        list and its length are printed.

        Args:
            ulist: List that receives the entries; it is modified in place.
            html: Markup of the search-result page.
        """
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): the class strings below are matched exactly as written,
        # including their leading/trailing spaces -- presumably copied from
        # the page markup; confirm against the current page.
        for node in soup.find_all('div', {'class': 'result c-container '}):
            abstract_node = node.find('div', {'class': 'c-abstract'})
            cite_node = node.find('a', {'class': 'c-showurl'})
            time_node = node.find('span', {'class': ' newTimeFactor_before_abs m'})
            # Skip incomplete results instead of crashing on a missing node.
            if time_node is None or abstract_node is None or cite_node is None:
                continue
            href = cite_node.get('href')
            if not href:
                continue
            ulist.append([bdurlCode(href), time_node.text, abstract_node.text])
        print(ulist)
        print(len(ulist))
    
    # Demo run: download one search-result page and collect its entries.
    url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E6%98%8E%E7%95%A5%E6%95%B0%E6%8D%AECTO&oq=%25E6%2598%258E%25E7%2595%25A5%25E6%2595%25B0%25E6%258D%25AE&rsv_pq=9429009d00000f0c&rsv_t=0278viP4h51Y2xMneo8a0HfaOkqnhW8wmti1KAz4ddKuKCUjeKDsh9yB1YM&rqlang=cn&rsv_enter=1&inputT=894&rsv_sug3=17&rsv_sug1=9&rsv_sug7=100&rsv_sug2=0&rsv_sug4=1273&rsv_sug=1"
    # Receives one [url, time, abstract] entry per parsed result.
    uinfo = []
    html = getHTMLText(url)
    fillList(uinfo, html)
  • 相关阅读:
    Git push 常见用法
    Git commit 常见用法
    Git add 常见用法
    Git-仓库
    Git clone 常见用法
    Git-简介
    ZOJ Problem Set
    ZOJ Problem Set
    ZOJ Problem Set
    ZOJ Problem Set
  • 原文地址:https://www.cnblogs.com/elpsycongroo/p/9455703.html
Copyright © 2020-2023  润新知