时间:1个小时左右
代码:200行左右
博客:1篇
知识点:Python的数据分析
今天学习了关于Python的爬虫操作:
from builtins import print
import random
import re

import requests
from bs4 import BeautifulSoup

# Seconds to wait for a server before giving up; without an explicit timeout
# a request can block forever on an unresponsive host.
REQUEST_TIMEOUT = 10

# Index page listing the epidemic bulletins.
url = "http://wsjkw.henan.gov.cn/ztzl/xxgzbdfyyqfk/yqtb/"

# NOTE: `random` is imported so that a pool of User-Agent strings could be
# rotated with random.choice(); a single fixed User-Agent is used for now.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.114 Safari/537.36"
    )
}

# --- Step 1: download the index page and collect the links it contains ---
resp = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
resp.encoding = "utf-8"
html = resp.text
print(html)

# Parse the HTML text into a BeautifulSoup tree. The parser is named
# explicitly so the result does not depend on which parsers are installed.
bs = BeautifulSoup(html, features="html.parser")

# Keep only the anchors whose "target" attribute is "_blank" (attrs filter).
content = bs.find_all("a", attrs={"target": "_blank"})
url_list = []
for item in content:
    href = item.get("href")
    if href is None:
        # An anchor without an href has no link to collect; indexing
        # item["href"] here would raise KeyError.
        continue
    print(item.string)
    print(href)
    url_list.append(href)
print(url_list)

# --- Step 2: download one bulletin page and extract the case counts ---
url_new = "http://wjw.beijing.gov.cn/wjwh/ztzl/xxgzbd/gzbdyqtb/202104/t20210401_2341891.html"
# Alternative bulletin page: "http://wsjkw.henan.gov.cn/2021/04-01/2118705.html"

# headers must be passed by keyword: the second positional parameter of
# requests.get() is `params`, which would put the dict in the query string.
resp = requests.get(url_new, headers=headers, timeout=REQUEST_TIMEOUT)
html = resp.content  # raw bytes; BeautifulSoup accepts bytes as input
print(html)

bs = BeautifulSoup(html, features="html.parser")
paragraphs = bs.find_all("p")
c_list = []
for item in paragraphs:
    # .string is None for a <p> that has more than one child node.
    print(item.string)
    c_list.append(item.string)
print(c_list)
print("***********")

# The bulletin reports cumulative confirmed cases, deaths and discharged
# cases in one sentence (e.g. 1273 / 22 / 1251); capture the three numbers.
# A raw string with \d+ is required: a bare "d+" only matches the letter d.
pattern = r"确诊病例(\d+)例.*?死亡病例(\d+)例.*?出院病例(.*?)例"
# Search the list's string form so a match may span several paragraphs.
result = re.search(pattern, str(c_list))
print("***********")
print(result)
if result is not None:
    # re.search returns None when the sentence is absent; only read the
    # groups when there actually is a match.
    print(result.group())
    print(result.groups())
    print(result.group(1))
print("***********")
print(c_list)