• 爬虫初学——爬取中国大学排名并存为csv文件


    链接:软科中国最好大学排名2016(http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html)

    代码:

     1 # -*- coding: utf-8 -*-
     2 """
     3 Created on Mon May 27 21:10:59 2019
     4 
     5 @author: Benny
     6 """
     7 
     8 import csv
     9 import os
    10 import requests
    11 import pandas
    12 from bs4 import BeautifulSoup
    13 allUniv = []
    def getHTMLText(url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to download.

        Returns:
            The page text, or an empty string when the request fails
            (connection error, 30-second timeout, or a 4xx/5xx status).
        """
        try:
            r = requests.get(url, timeout=30)
            # Turn 4xx/5xx responses into exceptions so they take the
            # failure path below.
            r.raise_for_status()
        except requests.RequestException:
            # Best effort: callers treat "" as "nothing downloaded".  Only
            # request-level failures are caught, so KeyboardInterrupt and
            # programming errors are no longer silently swallowed.
            return ""
        # Force UTF-8 decoding instead of the charset requests infers from
        # the response headers.
        r.encoding = 'utf-8'
        return r.text
    def fillUnivList(soup):
        """Append one row per ``<tr>`` that contains ``<td>`` cells to ``allUniv``.

        Rows without any ``<td>`` (for example header rows made of ``<th>``)
        are skipped.  Each appended row is a list with the text of every
        cell, in document order.

        Args:
            soup: Parsed document exposing ``find_all`` (a BeautifulSoup
                object in this script).
        """
        for tr in soup.find_all('tr'):
            cells = tr.find_all('td')
            if not cells:
                continue
            # ``td.string`` is None whenever a cell holds more than one child
            # node, which ended up as blank CSV fields.  ``get_text`` always
            # returns the cell's full text as a plain ``str``; strip=True
            # drops the surrounding whitespace of each text fragment.
            allUniv.append([td.get_text(strip=True) for td in cells])
    def writercsv(save_road,num,title):
        """Write the first ``num`` rows of ``allUniv`` to the CSV file ``save_road``.

        If ``save_road`` already exists the rows are appended to it; otherwise
        the file is created and ``title`` is written first as the header row.

        Args:
            save_road: Path of the CSV file to create or append to.
            num: Maximum number of rows to write.  Values larger than
                ``len(allUniv)`` write every available row; zero or negative
                values write no rows.
            title: Sequence of column labels used as the header of a new file.
        """
        is_new_file = not os.path.isfile(save_road)
        mode = 'w' if is_new_file else 'a'
        # An explicit encoding keeps the Chinese text independent of the
        # platform's locale encoding; the BOM written by utf-8-sig at the start
        # of a new file lets Excel detect UTF-8.
        # NOTE(review): appending to a file that was created earlier with a
        # different encoding will mix encodings -- recreate such files.
        with open(save_road, mode, newline='', encoding='utf-8-sig') as f:
            csv_write = csv.writer(f, dialect='excel')
            if is_new_file:
                csv_write.writerow(title)
            # Slicing, unlike indexing allUniv[i] for i in range(num), cannot
            # raise IndexError when fewer than ``num`` rows were collected.
            # max() keeps a negative ``num`` meaning "no rows", as range() did.
            csv_write.writerows(allUniv[:max(num, 0)])
    46  
    # Header row written at the top of a newly created CSV file
    # (presumably one label per column of the scraped ranking table).
    title=["排名","学校名称","省市","总分","生源质量","培养结果","科研规模","科研质量","顶尖成果","顶尖人才","科技服务","产学研究合作","成果转化"]
    # Output path.  It must be a raw string: in an ordinary literal "\U"
    # starts a \UXXXXXXXX escape, which makes this line a SyntaxError in
    # Python 3.
    save_road=r"C:\Users\Benny\Desktop\Python\Python练习sqlit_test02.csv"
    def main():
        """Download the 2016 ranking page and save its first ten rows to ``save_road``."""
        url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
        html = getHTMLText(url)
        if not html:
            # getHTMLText returns "" on failure; stop here instead of
            # continuing with an empty document and no rows to write.
            print("Failed to download " + url)
            return
        soup = BeautifulSoup(html, "html.parser")
        fillUnivList(soup)
        # Never request more rows than were actually parsed.
        writercsv(save_road, min(10, len(allUniv)), title)


    # Run the scraper only when executed as a script, not when imported.
    if __name__ == "__main__":
        main()

    文件截屏如下:(这里只保存了前十名的数据,可以通过修改 main() 中传给 writercsv 的 num 参数来保存更多)

  • 相关阅读:
    centos7启动redis命令
    临时和永久关闭Selinux
    坑人的Mysql5.7 (默认不支持Group By语句)(转)
    修改docker容器参数
    FastDFS常用命令
    SpringBoot集成RabbitMQ消息队列搭建与ACK消息确认入门
    git忽略.idea目录
    springboot2.x接口返回中文乱码
    解决ssh连接linux服务器速度慢
    基于SSD固态硬盘的数据库性能优化
  • 原文地址:https://www.cnblogs.com/shuxincheng/p/10933546.html
Copyright © 2020-2023  润新知