• 爬虫入门


    1、搜索结果爬取(未解析)

    #coding:utf-8
    import requests
    # Demo 1: send a keyword search to Baidu and report the HTTP status and
    # the length of the raw (unparsed) result page.
    url = "http://www.baidu.com/s"
    # Query-string parameters; requests encodes them into "?wd=Python".
    # NOTE(review): the original kept a commented-out
    # {'user-agent': 'Mozilla/5.0'} dict here -- presumably intended to be
    # passed as headers= when the site rejects the default client; confirm
    # before enabling.
    kv = {'wd': 'Python'}
    try:
        # A timeout keeps the script from hanging on a stalled connection.
        r = requests.get(url, params=kv, timeout=30)
        print(r.status_code)
        # Turn 4xx/5xx responses into an exception handled below.
        r.raise_for_status()
        # Decode the body with the encoding guessed from its content rather
        # than the one announced in the response headers.
        r.encoding = r.apparent_encoding
        print(len(r.text))
    except requests.RequestException:
        # Only network/HTTP failures are reported; Ctrl-C and programming
        # errors are no longer silently swallowed.
        print('产生异常')

    2、爬取图片

    #coding:utf-8
    import requests
    import os
    # Demo 2: download a single image and store it under `root`, named after
    # the last path segment of the URL. Nothing is downloaded if the file
    # already exists.
    # The scheme separator must be "http://"; with "http:" alone the URL has
    # no host and the request can never succeed.
    url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"
    root = "/Users/wangkun/Desktop/DEMO1/"
    path = root + url.split('/')[-1]
    try:
        if not os.path.exists(root):
            # makedirs also creates any missing parent directories.
            os.makedirs(root)
        if not os.path.exists(path):
            r = requests.get(url, timeout=30)
            # Fail before writing so an HTTP error page is never saved as
            # the image and reported as a success.
            r.raise_for_status()
            # The with-block closes the file; no explicit close() is needed.
            with open(path, 'wb') as f:
                f.write(r.content)
            print('文件保存成功')
        else:
            print('文件已存在')
    except (requests.RequestException, IOError, OSError):
        # Network/HTTP failures and filesystem errors are reported; other
        # exceptions (e.g. KeyboardInterrupt) propagate.
        print('爬取失败')

    3、IP归属地查询

    #coding:utf-8
    import requests
    # Demo 3: look up an IP address on ip138 and print the
    # last 500 characters of the returned page.
    url = "http://m.ip138.com/ip.asp?ip="
    try:
        # The IP to query is appended directly to the "ip=" query parameter.
        # A timeout keeps the script from hanging on a stalled connection.
        r = requests.get(url + '202.204.80.112', timeout=30)
        # Turn 4xx/5xx responses into an exception handled below.
        r.raise_for_status()
        # Decode the body with the encoding guessed from its content.
        r.encoding = r.apparent_encoding
        print(r.text[-500:])
    except requests.RequestException:
        # Only network/HTTP failures are reported; Ctrl-C and programming
        # errors are no longer silently swallowed.
        print('爬取失败')

    4、BeautifulSoup解析

    import requests
    from bs4 import BeautifulSoup
    # Demo 4: fetch a small demo page and explore it with BeautifulSoup.
    # A timeout keeps the script from hanging on a stalled connection.
    r = requests.get('http://python123.io/ws/demo.html', timeout=30)
    demo = r.text
    soup = BeautifulSoup(demo, 'html.parser')
    # prettify() only returns the re-indented markup as a string, so it has
    # to be printed to be of any use.
    print(soup.prettify())
    # List the target of every <a> tag in the document.
    for link in soup.find_all('a'):
        print(link.get('href'))

    # next_siblings is a generator; materialize it so the sibling nodes are
    # printed instead of the generator object's repr.
    print(list(soup.a.next_siblings))

     5、爬取并解析大学排名,print输出(数据结构结果)

    #coding:utf-8
    import requests
    from bs4 import BeautifulSoup
    import bs4
    import re
    def getHTMLText(url):
        """Fetch *url* and return the decoded response body.

        The request uses a 30-second timeout, and the body is decoded with
        the encoding guessed from its content (``apparent_encoding``).

        Args:
            url: Address of the page to download.

        Returns:
            The page text, or an empty string if the request fails (connection
            error, timeout, invalid URL, or a 4xx/5xx status).
        """
        try:
            r = requests.get(url, timeout=30)
            # Treat HTTP error statuses the same as connection failures.
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only request-level failures map to ''; unrelated errors (and
            # KeyboardInterrupt) are allowed to propagate.
            return ''
    
    
    def fillList(ulist,html):
        """Append one ``[col0, col1, col3]`` entry to *ulist* per table row in *html*.

        Only the direct children of the first ``<tbody>`` are examined. For
        every child that is a tag, the text of its 1st, 2nd and 4th ``<td>``
        cells is appended to *ulist* as a three-element list. A cell's
        ``.string`` is ``None`` when the cell does not contain exactly one
        text node, so entries may contain ``None``.

        Args:
            ulist: List that is extended in place.
            html: Markup of the page to parse; may be empty.

        Returns:
            None. *ulist* is left untouched when *html* has no ``<tbody>``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        tbody = soup.find('tbody')
        if tbody is None:
            # Empty or unexpected markup (e.g. a failed download): nothing to add.
            return
        for tr in tbody.children:
            # .children also yields whitespace text nodes between rows; skip them.
            if not isinstance(tr, bs4.element.Tag):
                continue
            tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
            if len(tds) < 4:
                # Rows without at least four cells cannot supply column 3.
                continue
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
    
    def printList(ulist,num):
        """Print a header line followed by at most *num* rows of *ulist*.

        Each line holds three centered, tab-separated columns (widths 10, 6
        and 10). If *ulist* has fewer than *num* entries, only the available
        entries are printed instead of raising ``IndexError``.

        Args:
            ulist: Sequence of rows; the first three items of each row are printed.
            num: Maximum number of rows to print.

        Returns:
            The string ``"Suc"`` followed by *num*.
        """
        row_fmt = "{:^10}\t{:^6}\t{:^10}"
        print(row_fmt.format('排名','学校名称','总分'))
        # max(..., 0) keeps a negative num printing nothing, as range(num) did.
        for u in ulist[:max(num, 0)]:
            print(row_fmt.format(u[0], u[1], u[2]))
        return "Suc" + str(num)
    
    
    # Demo 5 driver: download the ranking page, extract the rows and print
    # the first 20 universities.
    uinfo = []
    url = "http://www.zuihaodaxue.cn/shengyuanzhiliangpaiming2018.html"
    html = getHTMLText(url)
    if html:
        fillList(uinfo, html)
        # Never ask for more rows than were actually parsed.
        printList(uinfo, min(20, len(uinfo)))
    else:
        # getHTMLText returns '' when the download fails; there is nothing to parse.
        print('爬取失败')
  • 相关阅读:
    MYSQL判断某个表是否已经存在
    百度、雅虎、谷歌搜索引擎接口调用注意事项
    Codeigniter整合Tank Auth权限类库的教程
    短链接的生成算法
    自定义String
    运算符和结合性
    字符串类封装
    运算符重载
    数组类封装
    友元
  • 原文地址:https://www.cnblogs.com/elpsycongroo/p/9452269.html
Copyright © 2020-2023  润新知