• 获取百度搜索结果的真实URL、描述、标题


    1、场景

    爬虫练手代码

    2、代码

    Python2:

    #!/usr/bin/python
    # -*- coding:utf-8 -*-
    
    
    import requests
    from lxml import etree
    import BeautifulSoup
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def getfromBaidu(word):
        """Crawl Baidu search results for *word* and print each entry.

        Iterates result pages 1-19 and, for every result on a page, prints:
        the title, the resolved (de-redirected) real target URL, and the
        abstract text. All output goes to stdout; returns None.

        word -- the query string appended to the Baidu search URL.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, compress',
            'Accept-Language': 'en-us;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
            }
        baiduurl = 'http://www.baidu.com'
        url = 'http://www.baidu.com.cn/s?wd=' + word + '&cl=3'
        # k controls the range of result pages crawled
        for k in range(1, 20):
            # BUG FIX: the original passed `headers` positionally, which binds it
            # to requests.get's second parameter `params` -- the headers were
            # never actually sent. Pass it as a keyword argument.
            path = etree.HTML(requests.get(url, headers=headers).content)
            # Page 1 exposes 10 result entries / pagination anchors; later
            # pages expose 11. `flag` doubles as the index of the
            # "next page" link used at the bottom of the loop.
            flag = 10 if k == 1 else 11
            for i in range(1, flag):
                result_id = (k - 1) * 10 + i
                # Title: join all text nodes under the result's <h3><a>
                # (join() instead of the original quadratic += loop).
                sentence = ''.join(path.xpath('//*[@id="%d"]/h3/a//text()' % result_id))
                print(sentence)                # print the title
                # Real URL: Baidu result links are redirects; fetch without
                # following them and read the Location header instead.
                url_href = ''.join(path.xpath('//*[@id="%d"]/h3/a/@href' % result_id))
                try:
                    baidu_url = requests.get(url=url_href, headers=headers, allow_redirects=False)
                    real_url = baidu_url.headers['Location']  # original page address
                    print(real_url)               # print the URL
                except (requests.RequestException, KeyError):
                    # Narrowed from the original bare `except:`; a missing
                    # Location header raises KeyError, network failures raise
                    # requests.RequestException.
                    print("error %s %s" % (sentence, url_href))
                # Abstract: the container position varies between layouts,
                # so try both known xpaths.
                # BUG FIX: initialize `abstract` so it is never unbound (or
                # stale from the previous result) when both lookups are empty.
                abstract = ""
                res_abstract = path.xpath('//*[@id="%d"]/div[@class="c-abstract"]' % result_id)
                if not res_abstract:
                    res_abstract = path.xpath('//*[@id="%d"]/div/div[2]/div[@class="c-abstract"]' % result_id)
                if res_abstract:
                    abstract = res_abstract[0].xpath('string(.)')
                print(abstract)                  # print the abstract
    
            # Follow the "next page" link for the next iteration
            url = baiduurl + path.xpath('//*[@id="page"]/a[%d]/@href' % flag)[0]
        return
    
    # Main entry point: run a demo crawl for the query '上网'.
    # NOTE(review): getfromBaidu returns None, so this `print` only emits
    # "None" after the crawl output -- confirm whether that is intended.
    if __name__ == '__main__':
        print getfromBaidu('上网')
    

    3、效果

  • 相关阅读:
    重写GridView(转载)
    《Windows Communication Foundation之旅》系列之二(转载)
    C++类的继承与多重继承的访问控制(转载)
    准备出发
    10月8日 多云
    081014 曇後雨
    关于SQL Server 2005 Reporting Services的几点设置
    081007 浓雾
    081003 晴
    10月6日 上班
  • 原文地址:https://www.cnblogs.com/17bdw/p/10679884.html
Copyright © 2020-2023  润新知