Scraping requirements:
- Loop over every listing page and scrape the title of each item
- Save the scraped titles to a local file
Approach:
- Preparation
1. Copy the URLs of page 1, page 2, and page 3;
2. Compare the URLs and work out the pattern (see the sketch after this list).
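A minimal sketch of the pattern found in step 2, assuming (as the full script below does) that each listing page shares a fixed prefix and differs only in the trailing pageNum value:

# Assumed pattern: the page-N URL is the fixed prefix plus the page number.
base = 'http://www.shandong.gov.cn/col/col2268/index.html?uid=6820&pageNum='
for n in range(1, 4):
    print(base + str(n))
# -> ...pageNum=1
# -> ...pageNum=2
# -> ...pageNum=3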
- Implementation
1. Fetch each page with urllib.request.
2. Extract the titles from the fetched pages with a regular expression combined with bs4 (BeautifulSoup).
3. Save the extracted titles to a local file.
The complete script below implements these three steps.
import urllib.request
import re
from bs4 import BeautifulSoup


def get_html_text(url, data_list, depth):
    # Build the request header (spoof a desktop browser User-Agent)
    hd = ('User-Agent',
          'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0')
    # Create an opener object carrying the header
    opener = urllib.request.build_opener()
    opener.addheaders = [hd]
    # Install the opener globally so urlopen() uses it
    urllib.request.install_opener(opener)
    for i in range(1, depth):
        try:
            # Build the URL of page i
            url_depth = url + str(i)
            # Fetch and decode the page
            data = urllib.request.urlopen(url_depth).read().decode('utf-8', 'ignore')
            data_list.append(data)
            # print(data_list)
            # depth - 1 pages are fetched in total; \r keeps the progress on one line
            print("\rProgress: {:.2f}%".format((i * 100) / (depth - 1)), end="")
        except Exception:
            # Skip pages that fail to download
            pass


def html_parser(data_list, title_list):
    # Use a regular expression to pull out each <li> block
    for html in data_list:
        # print(html)
        html_li = '<li>(.*?)</li>'
        li_info = re.compile(html_li, re.S).findall(html)
        # print(li_info)
        # Then use bs4 on each block to read the title text
        for j in li_info:
            soup = BeautifulSoup(j, 'html.parser')
            title = soup.div.string
            title_list.append(title)


def main():
    url = 'http://www.shandong.gov.cn/col/col2268/index.html?uid=6820&pageNum='
    data_list = list()
    depth = 140
    get_html_text(url, data_list, depth)
    title_list = list()
    html_parser(data_list, title_list)
    # Save the titles to a local file, one per line
    with open('sd_title.txt', 'a', encoding='utf-8') as f:
        data_raw = [str(i) for i in title_list]
        for i in data_raw:
            f.write(i + '\n')


if __name__ == '__main__':
    main()
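To see step 2 in isolation, here is a small sketch run on a hypothetical <li> fragment (the sample HTML is invented for illustration; the real page markup may differ). The regex isolates each <li> block, and bs4 then reads the text of the block's first <div>, which is why soup.div.string yields the title.

import re
from bs4 import BeautifulSoup

# Hypothetical fragment shaped like the list items the script expects;
# the real page's markup may differ.
sample = '<ul><li><div>Sample notice title</div><span>2020-01-01</span></li></ul>'
for block in re.compile('<li>(.*?)</li>', re.S).findall(sample):
    soup = BeautifulSoup(block, 'html.parser')
    print(soup.div.string)  # -> Sample notice title

Note that sd_title.txt is opened in append mode ('a'), so rerunning the script appends the titles again; switch to mode 'w' if a fresh file is wanted on each run.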