# coding=utf8
"""Crawl the Sohu homepage and archive every linked page mentioning basketball.

Steps:
  1. Fetch the Sohu homepage.
  2. Extract every href value from the HTML.
  3. Keep only sohu page URLs (drop images / css / mail links, etc.).
  4. Fetch each remaining URL; pages whose body contains "篮球"
     are saved as numbered ``.html`` files under ``SAVE_DIR``.
"""
import re
from pathlib import Path

import requests

# Destination folder for saved pages.  Raw string: the original used a plain
# string whose backslashes (\w, \A, \C, \l) are invalid escape sequences.
SAVE_DIR = Path(r"F:\workspace\API_test\Crawlers\links")

# Asset/junk link patterns to exclude (images, stylesheets, icons, mailto...).
_SKIP_RE = re.compile(r'jpg|png|css|ico|tif|fig|mailto')

# Seconds to wait per HTTP request so one hung server can't stall the crawl.
_TIMEOUT = 10


def extract_links(html_text):
    """Return all href attribute values found in *html_text*."""
    return re.findall(r'href="(.*?)"', html_text)


def filter_links(links):
    """Keep only sohu page URLs; drop assets and non-sohu hosts.

    Protocol-relative URLs (``//...``) are given an explicit ``http:``
    scheme.  Every kept URL is stripped of surrounding whitespace (the
    original stripped only one of the two branches, so protocol-relative
    links could carry trailing whitespace into ``requests.get``).
    """
    valid_links = []
    for link in links:
        if "sohu" not in link:
            continue
        if _SKIP_RE.search(link):
            continue
        link = link.strip()
        if link.startswith("//"):
            link = "http:" + link
        valid_links.append(link)
    return valid_links


def save_basketball_pages(links, save_dir=SAVE_DIR):
    """Fetch each URL in *links*; save pages containing "篮球".

    Pages are written as ``0.html``, ``1.html``, ... under *save_dir*
    (created if missing).  A failed or timed-out fetch skips that link
    instead of aborting the whole crawl.  Returns the number of pages saved.
    """
    save_dir.mkdir(parents=True, exist_ok=True)
    saved = 0
    for link in links:
        try:
            r = requests.get(link, timeout=_TIMEOUT)
        except requests.RequestException:
            # Dead/slow/malformed link: skip it, keep crawling.
            continue
        if "篮球" in r.text:
            (save_dir / ("%s.html" % saved)).write_text(r.text, encoding="utf-8")
            saved += 1
    return saved


def main():
    """Entry point: crawl the Sohu homepage and archive basketball pages."""
    html = requests.get("http://www.sohu.com", timeout=_TIMEOUT)
    links = filter_links(extract_links(html.text))
    save_basketball_pages(links)


if __name__ == "__main__":
    main()