• 简单爬虫


     一.简单爬虫

    目标——写一个爬虫来爬取常用搜索引擎(比如百度、搜狗等)的首页。

    手段——使用 Python 的 requests、BeautifulSoup4 与 lxml 库。

    实现方式—— 

    首先,我们要获取到网页,这个不难,可以使用 requests.get() 解决,如下:

    # -*- coding: utf-8 -*-
    # PEP 263 only honours the declaration when there is no space between
    # "coding" and the ":" / "=" separator.

    import requests

    url = "http://www.baidu.com"

    # Bound the request time so that a slow network raises a timeout error
    # instead of leaving the script waiting without feedback.
    r = requests.get(url, timeout=1)

    # A status code of 200 means the page was fetched successfully.
    print(r.status_code)

     在 Windows 10 的 cmd 下,运行结果如下:

    # -*- coding: utf-8 -*-
    import requests
    from lxml import html
    from bs4 import BeautifulSoup

    # Lesson learned: never give this script the same name as a library it
    # imports; the import would then resolve to the script itself and fail.
    url = [
        "http://www.baidu.com",
        "http://www.google.com",
        "http://www.sogou.com",
        "http://www.bing.com",
        "http://www.so.com"
    ]

    name = ["baidu", "google", "sogou", "bing", "360"]

    # Disabled experiment, kept for reference inside a (now properly closed)
    # string literal: fetch every home page, save it to disk and print every
    # href attribute found in it. The inner loops use their own variable so
    # they no longer overwrite the outer index ``i``.
    '''
    for i in range(5):
        r = requests.get(url[i], timeout = 10)
        r.encoding = 'utf-8'
        tree = html.fromstring(r.text)
        urls = []
        if r.status_code == 200:
            #create html doc and save it
            with open("D:\\{}.html".format(name[i]), 'w', encoding='utf-8') as f:
                f.write(r.text)
            print("This is {} times: Successful!".format(i))
            for href in tree.xpath("//@href"):
                urls.append(href)
            for href in urls:
                print(href)

        else:
            print("This is {} times: False!".format(i))
    '''

    # Request the first URL (Baidu) 20 times and print each status code.
    for i in range(20):
        r = requests.get(url[0], timeout=1)
        print(r.status_code)
    # Decode the last response as UTF-8 before reading its text.
    r.encoding = 'utf-8'

    print("type of text: ", type(r.text))
    print("type of content: ", type(r.content))

    # BeautifulSoup expects markup (str or bytes), not a Response object, so
    # pass the decoded body; the parser is named explicitly to avoid the
    # "no parser was explicitly specified" warning.
    soup = BeautifulSoup(r.text, "html.parser")
    print(soup.get_text())
    #print(r.text)

     处理一个 HTML 文件

    a.打印 body 标签的内容

    b.获取 body 标签的内容

    c.获取 id 为 first 的标签对象

    d.获取并打印 HTML 页面的中文字符

    from lxml import html
    from bs4 import BeautifulSoup

    # Sample HTML document used as parser input.
    html_doc = """ <!DOCTYPE html> <html> <head> <meta charset="utf-8"> <title>菜鸟教程(runoob.com)</title> </head> <body> <h1>我的第一个标题</h1> <p>我的第一个段落。</p> </body> <table border="1"> <tr> <td>row 1, cell 1</td> <td>row 1, cell 2</td> </tr> <tr> <td>row 2, cell 1</td> <td>row 2, cell 2</td> </tr> </table> </html> """

    # Parse the page with BeautifulSoup to obtain a BeautifulSoup object.
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html_doc, "html.parser")

    # Print the page source, re-indented.
    print(soup.prettify())

    # Extract the text content of the page.
    #print(soup.get_text())
    text = soup.get_text()
    print("---------------")

    # Print the number of top-level nodes of the parsed document
    # (soup.contents is the list of the soup's direct children, not text lines).
    print(len(soup.contents))

    牛刀小试

    爬取中国大学排名内容,http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html

    爬取2015年的信息。

    # -*- coding: utf-8 -*-
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    # MongoClient is used below, so it has to be imported explicitly.
    from pymongo import MongoClient

    # Connect with the driver defaults (a local MongoDB server, per the PyMongo
    # documentation) and select the database and collection that
    # save_to_mongo() writes to.
    client = MongoClient()
    db = client['UnivRanking']
    collection = db['UnivRanking']
    
    def save_to_mongo(result):
        """Store scraped data in the module-level ``collection``.

        Args:
            result: A single document (mapping) or a list/tuple of documents.

        Failures are reported on stdout instead of being raised, so a database
        problem does not abort the rest of the script.
        """
        try:
            # Collection.insert() was deprecated in PyMongo 3 and removed in
            # PyMongo 4; insert_one()/insert_many() are its replacements.
            if isinstance(result, (list, tuple)):
                outcome = collection.insert_many(list(result))
            else:
                outcome = collection.insert_one(result)
            if outcome.acknowledged:
                print('Save to Mongo')
        except Exception as exc:
            # Best-effort save: keep going, but show why the insert failed.
            print("错误", exc)
    
    # Module-level accumulator: fillUnivList() appends one list of cell values
    # per table row; printUnivList() and main() read it afterwards.
    allUniv = []
    
    def getHTMLText(url):
        """Download ``url`` and return its body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.

        Returns:
            The page text, or an empty string when the request fails
            (connection error, timeout after 30 seconds, or an HTTP error
            status reported by ``raise_for_status``).
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP failures map to the empty-string result;
            # programming errors and Ctrl-C are no longer swallowed.
            return ""
        r.encoding = 'utf-8'
        return r.text
    
    def fillUnivList(soup):
        """Append one entry to ``allUniv`` for every table row that has cells.

        Args:
            soup: Parsed document; every ``<tr>`` element in it is inspected.

        Each appended entry is the list of ``.string`` values of the row's
        ``<td>`` cells, in document order. Rows without any ``<td>`` are skipped.
        """
        for row in soup.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                allUniv.append([cell.string for cell in cells])
    
    def _fit_row(row, width):
        """Return ``row`` as a list of exactly ``width`` cells (cut or None-padded)."""
        cells = list(row[:width])
        cells.extend([None] * (width - len(cells)))
        return cells


    def printUnivList(num):
        """Print the first ``num`` scraped rows as a table and export all rows to CSV.

        Args:
            num: Maximum number of rows to print; fewer are printed when fewer
                were scraped into ``allUniv``.

        Side effects:
            Prints the aligned table and the pandas DataFrame built from every
            row of ``allUniv``, and writes that DataFrame to ``testcsv.csv``.
        """
        name = ["排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"]
        # Full-width (CJK) space as fill character so Chinese text stays aligned.
        fill = chr(12288)

        # Force every row to the seven known columns: the row format string
        # needs seven cells and the DataFrame constructor rejects rows whose
        # length differs from the number of column names.
        rows = [_fit_row(u, len(name)) for u in allUniv]

        print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^8}{6:{0}^8}{7:{0}^8}".format(fill, *name))

        # Slice instead of indexing range(num): the page may have yielded fewer
        # than num rows (max() keeps a negative num meaning "print nothing").
        for cells in rows[:max(num, 0)]:
            # None cannot take an alignment format spec, so show it as empty.
            shown = ["" if cell is None else cell for cell in cells]
            print("{1:{0}^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}{6:{0}^10}{7:{0}^10}".format(fill, *shown))

        test = pd.DataFrame(columns=name, data=rows)
        print(test)
        test.to_csv('testcsv.csv', encoding='utf-8')
        
    
    def main(num):
        """Scrape the 2015 ranking page, print ``num`` rows and store all rows.

        Args:
            num: Number of rows handed to ``printUnivList`` for display.
        """
        url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2015_0.html"

        html = getHTMLText(url)
        soup = BeautifulSoup(html, "html.parser")
        fillUnivList(soup)
        printUnivList(num)

        # MongoDB documents must be mappings, so a bare list of cell values is
        # rejected by the driver. Turn every row into a dict keyed by column
        # name, with plain str values so no parse-tree objects are stored.
        fields = ["排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"]
        documents = [
            {field: (None if cell is None else str(cell)) for field, cell in zip(fields, row)}
            for row in allUniv
        ]
        # Nothing scraped means nothing to save; an empty bulk insert is an error.
        if documents:
            save_to_mongo(documents)


    if __name__ == "__main__":

        main(100)
  • 相关阅读:
    数字形式转换
    货币转换
    温度转换
    volatile 的可见性,禁止指令重排序,无法保证原子性的理解
    mysql索引的结构的分析
    史上最详细的ORACLE19c安装说明
    Solaris 修改联网代理的设置
    Oracle Drop表并未直接删除 drop table xx purge
    oracle自定义函数创建函数索引
    连线法合并两个有序链表
  • 原文地址:https://www.cnblogs.com/justlikecode/p/10907648.html
Copyright © 2020-2023  润新知