Python 3 in practice: fetching data from a website (Carbon Market Data-BJ) (pandas, bs4)


    While learning Python, I have been practicing on some simple real-world tasks. The programs below fetch the data I need from a website.

    While writing them I gradually picked up a few new techniques and found that Python really is convenient.

    In particular, using pandas to grab table data from a web page is remarkably easy!
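    To give a sense of how little code that takes: pd.read_html parses every table it finds on a page into a list of DataFrames. A minimal sketch against the same listing URL used below (it needs html5lib or lxml installed alongside pandas):

        import pandas as pd

        # read_html returns a list of DataFrames, one per <table> on the page;
        # header=0 uses the first table row as the column names.
        tables = pd.read_html('http://www.bjets.com.cn/article/jyxx/?', header=0)
        print(len(tables))       # how many tables were found
        print(tables[0].head())  # first few rows of the first table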

    The code may not be particularly well written, but it basically does what I need.

    Pointers from more experienced readers are welcome.

    Version 04 (Jan 12 2017) (recommended approach for extracting table data)

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        import pandas as pd

        url2 = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site currently has 39 listing pages; the count is hardcoded here
            # after checking the site manually. It could also be scraped from the
            # page (left as a future improvement).
            link = url2 + str(n)
            links.append(link)
        links.insert(0, url2)

        # pd.read_html needs the html5lib (or lxml) parser to be installed.
        # DataFrame.append was removed in pandas 2.0, so the page tables are
        # collected in a list and concatenated once at the end.
        frames = []
        for url in links:
            dfs = pd.read_html(url, header=0)  # one DataFrame per <table> on the page
            frames.extend(dfs)
        df2 = pd.concat(frames, ignore_index=True)

        # df2.to_excel('MktDataBJ.xlsx')  # save the data to an Excel file
        df2.to_csv('MktDataBJ-1.csv')  # save the data to a CSV file
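    The comment inside the loop notes that the total of 39 pages is hardcoded. A possible refinement, sketched below, is to read the largest page number out of the pagination links instead; the assumption that those links end in '/article/jyxx/?N' is a guess at the site's markup and should be checked before relying on it.

        import re
        import requests
        from bs4 import BeautifulSoup

        LIST_URL = 'http://www.bjets.com.cn/article/jyxx/?'

        def detect_page_count(url=LIST_URL):
            """Guess the number of listing pages from the pagination links.

            Assumes the pagination anchors point at URLs ending in
            '/article/jyxx/?N'; falls back to 1 if nothing matches.
            """
            soup = BeautifulSoup(requests.get(url).content, 'html.parser')
            numbers = []
            for a in soup.find_all('a', href=True):
                m = re.search(r'/article/jyxx/\?(\d+)$', a['href'])
                if m:
                    numbers.append(int(m.group(1)))
            return max(numbers) if numbers else 1

        # links = [LIST_URL] + [LIST_URL + str(n) for n in range(2, detect_page_count() + 1)]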

    Version 03 (Jan 12 2017)

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        from bs4 import BeautifulSoup
        import requests
        import csv

        url2 = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site currently has 39 listing pages; the count is hardcoded here
            # after checking the site manually. It could also be scraped from the
            # page (left as a future improvement).
            link = url2 + str(n)
            links.append(link)
        links.insert(0, url2)

        for url in links:
            rep = requests.get(url)
            # content = rep.text.encode(rep.encoding).decode('utf-8')
            # # When rep.text is used directly, the Chinese content needs transcoding

            soup = BeautifulSoup(rep.content, 'html.parser')

            # table = soup.table
            table = soup.find('table')  # either form works

            trs = table.find_all('tr')
            trs2 = trs[1:]  # skip the header row
            list1 = []
            for tr in trs2:
                td = tr.find_all('td')
                row = [i.text for i in td]
                list1.append(row)

            with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
                f_csv = csv.writer(f)
                f_csv.writerows(list1)
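    One caveat with opening the CSV in append mode inside the loop: the file never gets a header row, and re-running the script simply appends the same rows again. A small variation is to truncate the file and write the header once before the page loop; the column names below are placeholders that mirror the date / volume / mean price / total money fields handled in Version 01 further down, so adjust them to the real table headers.

        import csv

        HEADERS = ['date', 'volume', 'mean_price', 'total_amount']  # placeholder names

        # Overwrite the file and write the header exactly once, before the page loop.
        with open('MktDataBJ.csv', 'w', errors='ignore', newline='') as f:
            csv.writer(f).writerow(HEADERS)

        # ...then keep appending each page's rows inside the loop, as above:
        # with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
        #     csv.writer(f).writerows(list1)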

    Version 02 (Jan 09 2017)

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        from bs4 import BeautifulSoup
        import requests
        import csv

        url2 = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site currently has 39 listing pages; the count is hardcoded here
            # after checking the site manually. It could also be scraped from the
            # page (left as a future improvement).
            link = url2 + str(n)
            links.append(link)
        links.insert(0, url2)
        # print(links)

        for url in links:
            rep = requests.get(url)
            # content = rep.text.encode(rep.encoding).decode('utf-8')
            # # When rep.text is used directly, the Chinese content needs transcoding

            soup = BeautifulSoup(rep.content, 'html.parser')
            body = soup.body
            data = body.find('div', {'class': 'list_right'})

            quotes = data.find_all('tr')
            quotes1 = quotes[1:]  # skip the header row

            list1 = []
            for x in quotes1:
                list2 = []
                for y in x.find_all('td'):
                    list2.append(y.text)  # one list per daily record
                list1.append(list2)
            # print(list1)  # list1 holds all of the daily rows
            with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
                f_csv = csv.writer(f)
                f_csv.writerows(list1)
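    The commented-out transcoding line points at the usual encoding pitfall with requests: rep.text decodes using the encoding announced in the HTTP headers, which often garbles Chinese pages. Feeding the raw bytes of rep.content to BeautifulSoup, as the code above does, avoids the problem; another common option, sketched here, is to set the response encoding from the page body before using rep.text:

        import requests
        from bs4 import BeautifulSoup

        rep = requests.get('http://www.bjets.com.cn/article/jyxx/?')
        # apparent_encoding is detected from the response body rather than the
        # headers, so rep.text then decodes Chinese content correctly in most cases.
        rep.encoding = rep.apparent_encoding
        soup = BeautifulSoup(rep.text, 'html.parser')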

    Version 01 (Jan 08 2017)

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        from bs4 import BeautifulSoup
        import requests
        import csv

        urllink = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site currently has 39 listing pages; the count is hardcoded here
            # after checking the site manually. It could also be scraped from the
            # page (left as a future improvement).
            link = urllink + str(n)
            links.append(link)
        links.insert(0, urllink)
        # print(links)

        for url in links:

            rep = requests.get(url)
            # content = rep.text.encode(rep.encoding).decode('utf-8')
            # # When rep.text is used directly, the Chinese content needs transcoding

            soup = BeautifulSoup(rep.content, 'html.parser')

            # print(soup.prettify())
            # # prettify()

            body = soup.body
            data = body.find('div', {'class': 'list_right'})

            # table title
            titles = data.find_all('th')

            title = []
            for x in titles:
                title.append(x.text)
            # print(title)

            quotes = data.find_all('tr')
            quotes1 = quotes[1:]  # skip the header row
            # print(quotes1)

            list1 = []
            for x in quotes1:
                for y in x.find_all('td'):
                    list1.append(y.text)
            # print(list1)  # list1 holds every table cell for the page, flattened

            date = []
            volumes = []
            meanprice = []
            totalmoney = []

            # Each record has four cells, so split the flat list by position.
            for i in range(0, len(list1)):
                if i % 4 == 0:
                    date.append(list1[i])
                elif i % 4 == 1:
                    volumes.append(list1[i])
                elif i % 4 == 2:
                    meanprice.append(list1[i])
                else:
                    totalmoney.append(list1[i])

            # print(date)
            # print(volumes)
            # print(meanprice)
            # print(totalmoney)

            final = []
            for i in range(0, len(date)):
                temp = [date[i], volumes[i], meanprice[i], totalmoney[i]]
                final.append(temp)
            # print(final)
            with open('bj_carbon.csv', 'a', errors='ignore', newline='') as f:
                f_csv = csv.writer(f)
                f_csv.writerows(final)
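    The modulo bookkeeping at the end of Version 01 is what Versions 02 and 03 above replace by building one list per table row. The same regrouping can also be done directly on list1 by slicing it into chunks of four, as in the sketch below (it assumes every record really has exactly four cells):

        # Group the flat cell list into rows of [date, volume, mean price, total money].
        final = [list1[i:i + 4] for i in range(0, len(list1), 4)]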
Original article: https://www.cnblogs.com/lemonbit/p/6262977.html