import requests
import re

# Listing-page URL template; the page number is filled in via .format().
LIST_URL = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
SITE_ROOT = 'http://www.ygdy8.net'


def main():
    """Scrape the first movie-listing page, follow each detail link, and
    append the first download URL found on each detail page to tddy.txt."""
    for page in range(1, 2):
        try:
            listing = requests.get(LIST_URL.format(page), timeout=20)
            listing.raise_for_status()
        except requests.RequestException:
            continue  # best effort: skip unreachable listing pages
        # The site declares GB2312-encoded pages.
        listing.encoding = 'gb2312'
        detail_links = re.findall('<a href="(.*?)" class="ulink">', listing.text)

        for link in detail_links:
            try:
                detail = requests.get(SITE_ROOT + link, timeout=20)
                detail.raise_for_status()
            except requests.RequestException:
                continue
            detail.encoding = 'gb2312'
            ftp_links = re.findall('<a href="(.*?)">.*?</a></td>', detail.text)
            if not ftp_links:
                continue  # guard: avoid IndexError when the page layout differs
            with open('tddy.txt', 'a', encoding='utf-8') as f:
                # one link per line (the flattened original wrote a bare space
                # — presumably a '\n' lost in formatting; restore it)
                f.write(ftp_links[0] + '\n')


if __name__ == '__main__':
    main()
大学排名练习:
import bs4
import requests
from bs4 import BeautifulSoup


def get_html_text(url):
    """Fetch *url* and return its decoded body text, or '' on any
    network/HTTP error (the original returned ' ', which is
    indistinguishable from real content)."""
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        # The page omits a usable charset header; sniff it from the body.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def fill_univ_list(ulist, html):
    """Parse the ranking table in *html* and append [rank, name, score]
    rows to *ulist*. Does nothing if the table is missing."""
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return  # fetch failed or page layout changed; leave ulist untouched
    for tr in tbody.children:
        # children yields NavigableString whitespace nodes too; keep Tags only.
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            if len(tds) >= 4:  # guard against malformed/short rows
                ulist.append([tds[0].string, tds[1].string, tds[3].string])


def print_univ_list(ulist, num):
    """Print a header and the first *num* rows of *ulist*.

    chr(12288) is the full-width CJK space, used as the fill character so
    Chinese school names align in fixed-width columns."""
    tplt = "{0:^10} {1:{3}^10} {2:^10}"
    print(tplt.format("排名", "学校名称", "总分", chr(12288)))
    # Slicing avoids an IndexError when fewer than num rows were parsed.
    for u in ulist[:num]:
        print(tplt.format(u[0], u[1], u[2], chr(12288)))


def main():
    """Fetch the 2016 ranking page and print the top 20 universities."""
    uinfo = []
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
    html = get_html_text(url)
    fill_univ_list(uinfo, html)
    print_univ_list(uinfo, 20)


if __name__ == '__main__':
    main()
淘宝商品比价:
import requests
import re


def get_html_text(url):
    """Fetch *url* as UTF-8 text; return '' on any request failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ""


def parse_page(ilt, html):
    """Extract [price, title] pairs from the search-result JSON embedded
    in *html* and append them to *ilt*. Best effort: swallows parse errors."""
    try:
        # BUG fixed: the original pattern used [d.]* which matches only the
        # literal characters 'd' and '.'; prices are digits, so use [\d.]*.
        prices = re.findall(r'"view_price":"[\d.]*"', html)
        titles = re.findall(r'"raw_title":".*?"', html)
        # zip() guards against unequal match counts (original indexed titles
        # by the price index and could raise IndexError).
        for price_raw, title_raw in zip(prices, titles):
            # Strip the surrounding quotes instead of eval() — never eval
            # text scraped from an untrusted web page.
            price = price_raw.split(':')[1].strip('"')
            title = title_raw.split(':')[1].strip('"')
            ilt.append([price, title])
    except Exception:
        print("")  # preserve the original's best-effort behaviour


def print_goods_list(ilt):
    """Print an index / price / title table for the collected goods."""
    tplt = "{:4} {:8} {:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    for count, (price, title) in enumerate(ilt, start=1):
        print(tplt.format(count, price, title))


def main():
    """Search Taobao for the query across *depth* result pages and print
    the collected price/title table."""
    goods = '减肥餐'
    depth = 2
    start_url = 'http://s.taobao.com/search?q=' + goods
    info_list = []
    for i in range(depth):
        # Each Taobao result page is offset by 44 items via the s parameter.
        url = start_url + '&s=' + str(44 * i)
        html = get_html_text(url)  # already returns '' on failure
        parse_page(info_list, html)
    print_goods_list(info_list)


# The flattened original defined main() but never called it; guard the call
# so importing this module stays side-effect free.
if __name__ == '__main__':
    main()
股票数据:
import re
import sys
import traceback

import requests
from bs4 import BeautifulSoup


def get_html_text(url, code='utf-8'):
    """Fetch *url* with a desktop User-Agent and decode with *code*;
    return '' on any network/HTTP error."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    try:
        r = requests.get(url, timeout=20, headers=headers)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        return ""


def get_stock_list(lst, stock_url):
    """Collect sh/sz stock codes from anchor tags on the listing page and
    append each (as a one-element findall result) to *lst*."""
    html = get_html_text(stock_url, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a'):
        # BUG fixed: the original pattern r'[s][hz]d{6}' requires six
        # literal 'd' characters and can never match; codes are six digits.
        stock_code = re.findall(r'[s][hz]\d{6}', str(a))
        if stock_code:
            lst.append(stock_code)


def get_stock_info(lst, stock_url, fpath):
    """Scrape each stock's detail page and append its field dict to *fpath*,
    printing a running progress percentage."""
    total = len(lst)
    count = 0
    for stock in lst:
        url = stock_url + stock[0] + '.html'
        print(url)
        html = get_html_text(url)
        try:
            if html == "":
                continue
            info_dict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stock_info = soup.find('div', attrs={'class': 'stock-bets'})
            # First whitespace-separated token of the div text is the name.
            info_dict.update({'股票名称': stock_info.text.split()[0]})

            # <dt> tags are field names, <dd> tags the matching values.
            key_list = stock_info.find_all('dt')
            value_list = stock_info.find_all('dd')
            for key_tag, value_tag in zip(key_list, value_list):
                info_dict[key_tag.text] = value_tag.text

            with open(fpath, 'a', encoding='utf-8') as f:
                # one record per line (the flattened original wrote a bare
                # space — presumably a '\n' lost in formatting)
                f.write(str(info_dict) + '\n')
        except Exception:
            traceback.print_exc(file=sys.stdout)
        finally:
            # BUG fixed: the original passed end="" to str.format() instead
            # of print(); '\r' keeps the progress line in place.
            count += 1
            print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")


def main():
    """Build the stock-code list and dump per-stock info to a local file."""
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'http://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    get_stock_list(slist, stock_list_url)
    get_stock_info(slist, stock_info_url, output_file)


# The flattened original defined main() but never called it; guard the call
# so importing this module stays side-effect free.
if __name__ == '__main__':
    main()