As I learn, I have been practicing on some simple real-world tasks; the programs below fetch the data I need from a website.
Along the way I gradually picked up a number of techniques and found that Python really is convenient.
In particular, using pandas to grab table data out of a web page is remarkably easy!
The code may not be polished, but it basically meets my needs.
Pointers from more experienced readers are very welcome!
Version 04 (Jan 12 2017) [recommended method for grabbing table data]
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"

import pandas as pd

url2 = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # The site has 39 pages in total. That count is read off the site by
    # hand for now; it could also be scraped (see the sketch below).
    link = url2 + str(n)
    links.append(link)
links.insert(0, url2)

df_list = []  # collect each page's table, then concatenate once
for url in links:
    # pd.read_html needs an HTML parser such as html5lib installed
    dfs = pd.read_html(url, header=0)
    df_list.extend(dfs)

# DataFrame.append was removed in pandas 2.0, so use pd.concat instead
df2 = pd.concat(df_list, ignore_index=True)

# df2.to_excel('MktDataBJ.xlsx')  # save the data to an Excel file
df2.to_csv('MktDataBJ-1.csv')  # save the data to a CSV file
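The page count is still hardcoded, which the comment above flags as a future improvement. Below is a minimal sketch of detecting it automatically instead, assuming the pager links on the first page carry the page number in the query string (e.g. '.../article/jyxx/?39'); the exact pager markup is an assumption here, not something confirmed from the site.

import re
import requests
from bs4 import BeautifulSoup

def detect_page_count(base_url):
    # Scan every link on the first page and keep the largest trailing
    # '?<number>' found; assumes that is how the pager encodes pages.
    soup = BeautifulSoup(requests.get(base_url).content, 'html.parser')
    numbers = []
    for a in soup.find_all('a', href=True):
        m = re.search(r'\?(\d+)$', a['href'])
        if m:
            numbers.append(int(m.group(1)))
    return max(numbers) if numbers else 1

# n_pages = detect_page_count('http://www.bjets.com.cn/article/jyxx/?')

With that in place, range(2, 40) above could become range(2, n_pages + 1).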
Version 03 (Jan 12 2017)
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"

from bs4 import BeautifulSoup
import requests
import csv

url2 = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # The site has 39 pages in total; read off the site by hand for now,
    # though it could also be scraped from the page.
    link = url2 + str(n)
    links.append(link)
links.insert(0, url2)

for url in links:
    rep = requests.get(url)
    # content = rep.text.encode(rep.encoding).decode('utf-8')
    # # when using rep.text directly, Chinese content needs re-encoding

    soup = BeautifulSoup(rep.content, 'html.parser')

    # table = soup.table
    table = soup.find('table')  # both forms work

    trs = table.find_all('tr')
    trs2 = trs[1:]  # drop the header row
    list1 = []
    for tr in trs2:
        td = tr.find_all('td')
        row = [i.text for i in td]
        list1.append(row)

    with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(list1)
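One small gap in this version: the output CSV never gets a header row. A minimal sketch of writing it once before the page loop, assuming the first page's table names its columns in th cells (which is how Version 01 reads the titles):

from bs4 import BeautifulSoup
import requests
import csv

url2 = 'http://www.bjets.com.cn/article/jyxx/?'

# Read the column names from the first page's <th> cells.
first = BeautifulSoup(requests.get(url2).content, 'html.parser')
header = [th.text.strip() for th in first.find('table').find_all('th')]

# Open with 'w' so each run starts fresh; the page loop above then
# appends the data rows with mode 'a' as before.
with open('MktDataBJ.csv', 'w', errors='ignore', newline='') as f:
    csv.writer(f).writerow(header)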
Version 02 (Jan 09 2017)
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"

from bs4 import BeautifulSoup
import requests
import csv

url2 = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # The site has 39 pages in total; read off the site by hand for now,
    # though it could also be scraped from the page.
    link = url2 + str(n)
    links.append(link)
links.insert(0, url2)
# print(links)

for url in links:
    rep = requests.get(url)
    # content = rep.text.encode(rep.encoding).decode('utf-8')
    # # when using rep.text directly, Chinese content needs re-encoding

    soup = BeautifulSoup(rep.content, 'html.parser')
    body = soup.body
    data = body.find('div', {'class': 'list_right'})

    quotes = data.find_all('tr')
    quotes1 = quotes[1:]  # drop the header row

    list1 = []
    for x in quotes1:
        list2 = []
        for y in x.find_all('td'):
            list2.append(y.text)  # one list per daily record
        list1.append(list2)
    # print(list1)  # list1 holds all the daily records

    with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(list1)
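Because the file is opened in append mode on every pass, rerunning the script keeps stacking the same rows onto MktDataBJ.csv. A minimal sketch of the same scrape with the file opened once in 'w' mode, so each run starts clean:

from bs4 import BeautifulSoup
import requests
import csv

url2 = 'http://www.bjets.com.cn/article/jyxx/?'
links = [url2] + [url2 + str(n) for n in range(2, 40)]

# Open once with 'w' so a rerun starts from a clean file.
with open('MktDataBJ.csv', 'w', errors='ignore', newline='') as f:
    f_csv = csv.writer(f)
    for url in links:
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        data = soup.body.find('div', {'class': 'list_right'})
        for tr in data.find_all('tr')[1:]:  # skip the header row
            f_csv.writerow([td.text for td in tr.find_all('td')])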
Version 01 (Jan 08 2017)
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"

from bs4 import BeautifulSoup
import requests
import csv

urllink = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # The site has 39 pages in total; read off the site by hand for now,
    # though it could also be scraped from the page.
    link = urllink + str(n)
    links.append(link)
links.insert(0, urllink)
# print(links)

for url in links:
    rep = requests.get(url)
    # content = rep.text.encode(rep.encoding).decode('utf-8')
    # # when using rep.text directly, Chinese content needs re-encoding

    soup = BeautifulSoup(rep.content, 'html.parser')
    # print(soup.prettify())

    body = soup.body
    data = body.find('div', {'class': 'list_right'})

    # table title
    titles = data.find_all('th')
    title = []
    for x in titles:
        title.append(x.text)
    # print(title)

    quotes = data.find_all('tr')
    quotes1 = quotes[1:]  # drop the header row
    # print(quotes1)

    list1 = []
    for x in quotes1:
        for y in x.find_all('td'):
            list1.append(y.text)
    # print(list1)  # flat list of every cell value for the page

    date = []
    volumes = []
    meanprice = []
    totalmoney = []

    # the table has four columns, so split the flat list by index mod 4
    for i in range(len(list1)):
        if i % 4 == 0:
            date.append(list1[i])
        elif i % 4 == 1:
            volumes.append(list1[i])
        elif i % 4 == 2:
            meanprice.append(list1[i])
        else:
            totalmoney.append(list1[i])

    # print(date)
    # print(volumes)
    # print(meanprice)
    # print(totalmoney)

    final = []
    for i in range(len(date)):
        temp = [date[i], volumes[i], meanprice[i], totalmoney[i]]
        final.append(temp)
    # print(final)

    with open('bj_carbon.csv', 'a', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(final)
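The modulo bookkeeping at the end of Version 01 regroups a flat list of cells into rows of four. The same step can be written as a single slicing comprehension; a minimal sketch with hypothetical example values (the real values come from the scraped td cells):

# Regroup a flat cell list into rows of four (date, volume, mean
# price, total money) with slicing instead of modulo bookkeeping.
list1 = ['2017-01-06', '1000', '50.00', '50000',
         '2017-01-05', '800', '49.50', '39600']  # made-up example cells

final = [list1[i:i + 4] for i in range(0, len(list1), 4)]
print(final)
# [['2017-01-06', '1000', '50.00', '50000'],
#  ['2017-01-05', '800', '49.50', '39600']]

This is essentially what Versions 02 and 03 do instead by building each row directly from its own tr element.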